In [4]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from matplotlib import pyplot as plt
import seaborn as sns
import eli5
import csv

In [5]:
train= pd.read_csv(r'MoviesWithSynopsisAndCategory.csv').sample(frac=0.7)
test= pd.read_csv(r'MoviesWithSynopsisAndCategory.csv').sample(frac=0.3)

In [6]:
text_transformer = TfidfVectorizer( ngram_range=(1, 2),  max_features=150000)

In [7]:
%%time
X_train_text = text_transformer.fit_transform(train['ملخص'])
X_test_text = text_transformer.transform(test['ملخص'])

Wall time: 261 ms


In [8]:
X_train_text.shape,X_test_text.shape

((828, 43400), (355, 43400))

In [9]:
logit = LogisticRegression(C=5e1, solver='lbfgs', multi_class='multinomial', random_state=17, n_jobs=4)

In [10]:
skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=17)


In [11]:
%%time
cv_results = cross_val_score(logit, X_train_text, train['تصنيف الفيلم'], cv=skf, scoring='f1_micro')



Wall time: 26.9 s


In [12]:
cv_results, cv_results.mean()


(array([0.48067633, 0.47826087]), 0.47946859903381644)

In [13]:
%%time
logit.fit(X_train_text, train['تصنيف الفيلم'])

Wall time: 15.1 s


LogisticRegression(C=50.0, multi_class='multinomial', n_jobs=4, random_state=17)

In [14]:
%%time
eli5.show_weights(estimator=logit, 
                  feature_names= list(text_transformer.get_feature_names()),
                 top=(50, 5))

Wall time: 318 ms


Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0,Unnamed: 13_level_0,Unnamed: 14_level_0,Unnamed: 15_level_0,Unnamed: 16_level_0,Unnamed: 17_level_0,Unnamed: 18_level_0,Unnamed: 19_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4,Unnamed: 17_level_4,Unnamed: 18_level_4,Unnamed: 19_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5,Unnamed: 10_level_5,Unnamed: 11_level_5,Unnamed: 12_level_5,Unnamed: 13_level_5,Unnamed: 14_level_5,Unnamed: 15_level_5,Unnamed: 16_level_5,Unnamed: 17_level_5,Unnamed: 18_level_5,Unnamed: 19_level_5
Weight?,Feature,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6,Unnamed: 8_level_6,Unnamed: 9_level_6,Unnamed: 10_level_6,Unnamed: 11_level_6,Unnamed: 12_level_6,Unnamed: 13_level_6,Unnamed: 14_level_6,Unnamed: 15_level_6,Unnamed: 16_level_6,Unnamed: 17_level_6,Unnamed: 18_level_6,Unnamed: 19_level_6
Weight?,Feature,Unnamed: 2_level_7,Unnamed: 3_level_7,Unnamed: 4_level_7,Unnamed: 5_level_7,Unnamed: 6_level_7,Unnamed: 7_level_7,Unnamed: 8_level_7,Unnamed: 9_level_7,Unnamed: 10_level_7,Unnamed: 11_level_7,Unnamed: 12_level_7,Unnamed: 13_level_7,Unnamed: 14_level_7,Unnamed: 15_level_7,Unnamed: 16_level_7,Unnamed: 17_level_7,Unnamed: 18_level_7,Unnamed: 19_level_7
Weight?,Feature,Unnamed: 2_level_8,Unnamed: 3_level_8,Unnamed: 4_level_8,Unnamed: 5_level_8,Unnamed: 6_level_8,Unnamed: 7_level_8,Unnamed: 8_level_8,Unnamed: 9_level_8,Unnamed: 10_level_8,Unnamed: 11_level_8,Unnamed: 12_level_8,Unnamed: 13_level_8,Unnamed: 14_level_8,Unnamed: 15_level_8,Unnamed: 16_level_8,Unnamed: 17_level_8,Unnamed: 18_level_8,Unnamed: 19_level_8
Weight?,Feature,Unnamed: 2_level_9,Unnamed: 3_level_9,Unnamed: 4_level_9,Unnamed: 5_level_9,Unnamed: 6_level_9,Unnamed: 7_level_9,Unnamed: 8_level_9,Unnamed: 9_level_9,Unnamed: 10_level_9,Unnamed: 11_level_9,Unnamed: 12_level_9,Unnamed: 13_level_9,Unnamed: 14_level_9,Unnamed: 15_level_9,Unnamed: 16_level_9,Unnamed: 17_level_9,Unnamed: 18_level_9,Unnamed: 19_level_9
Weight?,Feature,Unnamed: 2_level_10,Unnamed: 3_level_10,Unnamed: 4_level_10,Unnamed: 5_level_10,Unnamed: 6_level_10,Unnamed: 7_level_10,Unnamed: 8_level_10,Unnamed: 9_level_10,Unnamed: 10_level_10,Unnamed: 11_level_10,Unnamed: 12_level_10,Unnamed: 13_level_10,Unnamed: 14_level_10,Unnamed: 15_level_10,Unnamed: 16_level_10,Unnamed: 17_level_10,Unnamed: 18_level_10,Unnamed: 19_level_10
Weight?,Feature,Unnamed: 2_level_11,Unnamed: 3_level_11,Unnamed: 4_level_11,Unnamed: 5_level_11,Unnamed: 6_level_11,Unnamed: 7_level_11,Unnamed: 8_level_11,Unnamed: 9_level_11,Unnamed: 10_level_11,Unnamed: 11_level_11,Unnamed: 12_level_11,Unnamed: 13_level_11,Unnamed: 14_level_11,Unnamed: 15_level_11,Unnamed: 16_level_11,Unnamed: 17_level_11,Unnamed: 18_level_11,Unnamed: 19_level_11
Weight?,Feature,Unnamed: 2_level_12,Unnamed: 3_level_12,Unnamed: 4_level_12,Unnamed: 5_level_12,Unnamed: 6_level_12,Unnamed: 7_level_12,Unnamed: 8_level_12,Unnamed: 9_level_12,Unnamed: 10_level_12,Unnamed: 11_level_12,Unnamed: 12_level_12,Unnamed: 13_level_12,Unnamed: 14_level_12,Unnamed: 15_level_12,Unnamed: 16_level_12,Unnamed: 17_level_12,Unnamed: 18_level_12,Unnamed: 19_level_12
Weight?,Feature,Unnamed: 2_level_13,Unnamed: 3_level_13,Unnamed: 4_level_13,Unnamed: 5_level_13,Unnamed: 6_level_13,Unnamed: 7_level_13,Unnamed: 8_level_13,Unnamed: 9_level_13,Unnamed: 10_level_13,Unnamed: 11_level_13,Unnamed: 12_level_13,Unnamed: 13_level_13,Unnamed: 14_level_13,Unnamed: 15_level_13,Unnamed: 16_level_13,Unnamed: 17_level_13,Unnamed: 18_level_13,Unnamed: 19_level_13
Weight?,Feature,Unnamed: 2_level_14,Unnamed: 3_level_14,Unnamed: 4_level_14,Unnamed: 5_level_14,Unnamed: 6_level_14,Unnamed: 7_level_14,Unnamed: 8_level_14,Unnamed: 9_level_14,Unnamed: 10_level_14,Unnamed: 11_level_14,Unnamed: 12_level_14,Unnamed: 13_level_14,Unnamed: 14_level_14,Unnamed: 15_level_14,Unnamed: 16_level_14,Unnamed: 17_level_14,Unnamed: 18_level_14,Unnamed: 19_level_14
Weight?,Feature,Unnamed: 2_level_15,Unnamed: 3_level_15,Unnamed: 4_level_15,Unnamed: 5_level_15,Unnamed: 6_level_15,Unnamed: 7_level_15,Unnamed: 8_level_15,Unnamed: 9_level_15,Unnamed: 10_level_15,Unnamed: 11_level_15,Unnamed: 12_level_15,Unnamed: 13_level_15,Unnamed: 14_level_15,Unnamed: 15_level_15,Unnamed: 16_level_15,Unnamed: 17_level_15,Unnamed: 18_level_15,Unnamed: 19_level_15
Weight?,Feature,Unnamed: 2_level_16,Unnamed: 3_level_16,Unnamed: 4_level_16,Unnamed: 5_level_16,Unnamed: 6_level_16,Unnamed: 7_level_16,Unnamed: 8_level_16,Unnamed: 9_level_16,Unnamed: 10_level_16,Unnamed: 11_level_16,Unnamed: 12_level_16,Unnamed: 13_level_16,Unnamed: 14_level_16,Unnamed: 15_level_16,Unnamed: 16_level_16,Unnamed: 17_level_16,Unnamed: 18_level_16,Unnamed: 19_level_16
Weight?,Feature,Unnamed: 2_level_17,Unnamed: 3_level_17,Unnamed: 4_level_17,Unnamed: 5_level_17,Unnamed: 6_level_17,Unnamed: 7_level_17,Unnamed: 8_level_17,Unnamed: 9_level_17,Unnamed: 10_level_17,Unnamed: 11_level_17,Unnamed: 12_level_17,Unnamed: 13_level_17,Unnamed: 14_level_17,Unnamed: 15_level_17,Unnamed: 16_level_17,Unnamed: 17_level_17,Unnamed: 18_level_17,Unnamed: 19_level_17
Weight?,Feature,Unnamed: 2_level_18,Unnamed: 3_level_18,Unnamed: 4_level_18,Unnamed: 5_level_18,Unnamed: 6_level_18,Unnamed: 7_level_18,Unnamed: 8_level_18,Unnamed: 9_level_18,Unnamed: 10_level_18,Unnamed: 11_level_18,Unnamed: 12_level_18,Unnamed: 13_level_18,Unnamed: 14_level_18,Unnamed: 15_level_18,Unnamed: 16_level_18,Unnamed: 17_level_18,Unnamed: 18_level_18,Unnamed: 19_level_18
Weight?,Feature,Unnamed: 2_level_19,Unnamed: 3_level_19,Unnamed: 4_level_19,Unnamed: 5_level_19,Unnamed: 6_level_19,Unnamed: 7_level_19,Unnamed: 8_level_19,Unnamed: 9_level_19,Unnamed: 10_level_19,Unnamed: 11_level_19,Unnamed: 12_level_19,Unnamed: 13_level_19,Unnamed: 14_level_19,Unnamed: 15_level_19,Unnamed: 16_level_19,Unnamed: 17_level_19,Unnamed: 18_level_19,Unnamed: 19_level_19
+2.630,علوي,,,,,,,,,,,,,,,,,,
+2.539,ملهي,,,,,,,,,,,,,,,,,,
+2.418,فرقه,,,,,,,,,,,,,,,,,,
+2.357,هويدا,,,,,,,,,,,,,,,,,,
+2.233,احمد علوي,,,,,,,,,,,,,,,,,,
+2.162,بلبله,,,,,,,,,,,,,,,,,,
+1.995,عصفور,,,,,,,,,,,,,,,,,,
+1.979,راقص,,,,,,,,,,,,,,,,,,
+1.956,مطرب,,,,,,,,,,,,,,,,,,
+1.927,مشمش,,,,,,,,,,,,,,,,,,

Weight?,Feature
+2.630,علوي
+2.539,ملهي
+2.418,فرقه
+2.357,هويدا
+2.233,احمد علوي
+2.162,بلبله
+1.995,عصفور
+1.979,راقص
+1.956,مطرب
+1.927,مشمش

Weight?,Feature
+2.495,قبيل
+2.479,رسول
+2.257,نبي
+2.243,مسلم
+2.161,اسلام
+2.146,عزم باشا
+2.128,بلال
+2.125,شعب
+1.907,ثاءر
+1.749,عزم

Weight?,Feature
+2.702,اسماعيل
+2.417,تامين
+1.990,شاب صعيد
+1.975,محتال
+1.834,فاطمه
+1.418,صعيد
+1.373,احيي
+1.368,منعم اسماعيل
+1.368,اسماعيل اسماعيل
+1.335,معلم عباس

Weight?,Feature
+2.319,جندي
+2.027,مقاومه
+1.824,زيد
+1.824,ابو زيد
+1.650,مغربي
+1.570,رابطه
+1.544,جاسوس
+1.471,عدوان
+1.364,سلمي
+1.345,شعب

Weight?,Feature
+2.986,سجن
+2.623,عصابه
+2.321,طفل
+2.122,سليم
+2.055,حموده
+1.794,زبيده
+1.785,وحش
+1.782,طارق
+1.704,رضوان
+1.635,قبيل

Weight?,Feature
+1.044,مقتل روح
+1.044,سبيل قتل
+1.044,ساعد عثور
+1.044,كيتي راقص
+1.044,ٱختيار اسماعيل
+1.044,خاص تسديد
+1.044,حاكي
+1.044,ليلي حاكي
+1.044,عثور قاتل
+1.044,قصاص وقع

Weight?,Feature
+1.598,تصغير
+0.799,مسكين غني
+0.799,قسط
+0.799,ٱستخدام مهاره
+0.799,فتاه ٱبنه
+0.799,المزكي
+0.799,محتاج نجح
+0.799,محتاج
+0.799,اصبع
+0.799,قسط ضءيل

Weight?,Feature
+4.483,<BIAS>
+3.196,ام
+3.023,حسين
+2.354,اهل
+2.272,زوجه
+2.224,كبير
+2.063,طرد
+2.054,ٱبن
+2.054,حياه
+1.996,مات

Weight?,Feature
+2.103,عصابه
+2.095,تمثال
+2.093,حانوتي
+1.776,ضابط شرطه
+1.269,ضابط
+1.112,عصابه عصابه
+1.112,ٱكتشاف قتل
+1.112,عمل دفن
+1.112,سرق تحفه
+1.112,حانوتي عمل

Weight?,Feature
+4.118,حب
+3.714,ليلي
+3.478,امير
+3.374,نادي
+3.348,امال
+3.142,عادل
+2.988,مشعر
+2.952,<BIAS>
+2.889,سامي
+2.690,فتاه

Weight?,Feature
+2.951,زيزو
+2.040,كره قدم
+2.037,فريق
+1.307,سمير
+1.287,كره
+1.218,قدم
+1.083,محترف معارضه
+1.083,اﻹنتصارات فريق
+1.083,ٱنضم فريق
+1.083,فريق وقع

Weight?,Feature
+2.662,ادهم
+2.130,عنتر
+1.945,صليبي
+1.756,امير
+1.722,قصه حياه
+1.599,ٱحتلال
+1.537,فرنسا
+1.519,تناول
+1.511,حياه
+1.506,مخرج

Weight?,Feature
+1.850,معلم زكي
+1.744,ٱنتقام
+1.598,اخلاص
+1.423,حمدي
+1.383,شقيق
+1.271,كيمياءي
+1.203,حاول غندور
+1.191,معمل
+1.167,زكي
+1.147,عمل

Weight?,Feature
+2.157,جمعيه
+1.609,وطني
+1.219,ناهد
+1.159,مجموع شب
+1.104,عضو
+1.090,عماد
+1.024,ادهم
+0.990,شخص
+0.963,اهم
+0.881,تخيل واجه

Weight?,Feature
+1.504,ام حسن
+1.384,حسن
+0.918,مرسيل خليفه
+0.918,عمل شيوعي
+0.918,شاعر حسن
+0.918,مرسيل
+0.918,ام شهيد
+0.918,عمل عسكري
+0.918,غني مرسيل
+0.918,حسن عبدالل

Weight?,Feature
+4.073,زوجه
+3.134,فندق
+2.763,محل
+2.711,شخص
+2.580,زوج
+2.471,<BIAS>
+2.230,مليوناري
+2.138,سوسو
+2.084,تابوت
+2.030,مدبولي

Weight?,Feature
+3.324,حسام
+3.165,عماشه
+2.236,مهرب
+1.970,صحراء
+1.934,وسط
+1.928,رفعت
+1.907,فواز
+1.904,مغول
+1.790,عثمان
+1.771,عم شريك

Weight?,Feature
+2.220,اسراءيل
+1.748,حرب
+1.579,ٱستعراض
+1.289,لحظه
+1.207,قصير
+1.076,سلاح
+1.006,نشيد وطني
+1.006,نشيد
+1.006,هدوء طفل
+1.006,شخص جنوبي

Weight?,Feature
+3.280,شاكر
+2.880,بوليس
+2.869,ناهد
+2.575,نمر
+2.446,صحيفه
+2.383,دلال
+2.236,حمار
+2.144,نبيل
+2.142,<BIAS>
+2.032,قبض

Weight?,Feature
+2.427,مدح
+1.668,امشير
+0.834,صعيد دعا
+0.834,حكم قضاءي
+0.834,شوقي فتوه
+0.834,شهر تسبب
+0.834,زهور ٱكتشف
+0.834,ثني ود
+0.834,ثري مدح
+0.834,بار ٱمتلك


In [15]:
test_preds = logit.predict(X_test_text)
pd.DataFrame(test_preds, columns=['تصنيف'])


Unnamed: 0,تصنيف
0,رومانسي
1,دراما
2,دراما
3,كوميدي
4,دراما
...,...
350,رومانسي
351,دراما
352,كوميدي
353,كوميدي


In [17]:
count=0
a=list(test_preds)
b=list(test["تصنيف الفيلم"])
for i in range(len(test)):
    if a[i]==b[i]:
        count+=1
print("Accuracy Precentage ="+str(round((count/len(test))*100,3)))

Accuracy Precentage =84.225
