In [312]:
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier, RidgeClassifier
from sklearn import metrics
from sklearn.naive_bayes import BernoulliNB, ComplementNB, MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier
from sklearn import model_selection

In [313]:
def basic_tokenize(tweet):
    return tweet.split(' ')

def skipgram_tokenize(tweet, n=None, k=None, include_all=True):
    from nltk.util import skipgrams
    tokens = [w for w in basic_tokenize(tweet)]
    if include_all:
        result = []
        for i in range(k+1):
            skg = [w for w in skipgrams(tokens, n, i)]
            result = result+skg
    else:
        result = [w for w in skipgrams(tokens, n, k)]
    result=set(result)
    #print(result)
    return result

def make_skip_tokenize(n, k, include_all=True):
    return lambda tweet: skipgram_tokenize(tweet, n=n, k=k, include_all=include_all)


In [334]:
def build_estimators():
    estimators = []
    sgd = SGDClassifier(alpha=0.00001, max_iter=50,penalty="l2") 
    estimators.append(('sgd', sgd))
    svc = LinearSVC(penalty='l2', dual=False,tol=1e-3)
    estimators.append(('svc',svc))
    mnb= MultinomialNB(alpha=.01)
    estimators.append(('mnb',mnb))
    bnb= BernoulliNB(alpha=.01)
    estimators.append(('bnb',bnb))
    ensemble = VotingClassifier(estimators)
    return ensemble

In [329]:
train_file = '../data/D6_26/6dialects/train_small'  #'../data/Dialect6/Multi_data/train/post_clean'#  
#test_file = '../data/Dialect6/Multi_data/dev/post_clean'
#test_file = '../data/Dialect26/Multi_data/dev/post_clean'
test_file = '../data/D6_26/6dialects/dev'

print("Loading MADAR dataset for categories:")
data_train = load_files(train_file, encoding = 'utf-8',decode_error='ignore')
data_test = load_files(test_file, encoding = 'utf-8',decode_error='ignore')
y_train = data_train.target
y_test = data_test.target
print(data_train.target_names)
target_names = data_train.target_names
print("Traing Data:   {0}".format(len(data_train.data)))
print("Testing Data:   {0}".format(len(data_test.data)))
print("%d categories" % len(target_names))
print()


Loading MADAR dataset for categories:
['BEI', 'CAI', 'DOH', 'MSA', 'RAB', 'TUN']
Traing Data:   41606
Testing Data:   5007
6 categories



In [330]:
print("Extracting features from the training data using a sparse vectorizer")

union = FeatureUnion([("w_v", TfidfVectorizer(sublinear_tf=True, max_df=0.5,analyzer = 'word', ngram_range=(1,2))),
                       ("c_wb", TfidfVectorizer(sublinear_tf=True, max_df=0.5,analyzer = 'char', ngram_range=(2,5))),
                        ("sk",TfidfVectorizer(sublinear_tf=True, max_df=0.5,tokenizer=make_skip_tokenize(n=2, k=2)))
        ],
transformer_weights={
            'w_v': 0.5,
            'c_wb': 0.5,
            'sk': 0.4,
        }
,
)
X_train = union.fit_transform(data_train.data)
X_test = union.transform(data_test.data)
print("Combined space has", X_train.shape[1], "features")

ensemble = build_estimators()
ensemble.fit(X_train, y_train)

pred = ensemble.predict(X_test)
#for i in range(0,10):
 #   print(data_train.target_names[pred[i]])

score = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)

print("classification report:")
print(metrics.classification_report(y_test, pred,target_names=target_names))

print("confusion matrix:")
print(metrics.confusion_matrix(y_test, pred))

Extracting features from the training data using a sparse vectorizer
Combined space has 599530 features
accuracy:   0.821
classification report:
              precision    recall  f1-score   support

         BEI       0.84      0.87      0.85      1201
         CAI       0.83      0.84      0.84       801
         DOH       0.81      0.78      0.79      1201
         MSA       0.76      0.74      0.75       601
         RAB       0.93      0.85      0.89       601
         TUN       0.76      0.85      0.80       602

   micro avg       0.82      0.82      0.82      5007
   macro avg       0.82      0.82      0.82      5007
weighted avg       0.82      0.82      0.82      5007

confusion matrix:
[[1040   39   72   15    4   31]
 [  47  675   26   22    2   29]
 [  83   54  933   70   12   49]
 [  32   21   78  444   10   16]
 [  14    8   13   22  511   33]
 [  28   13   28   11   13  509]]


In [331]:
train_files = []
test_files = []
data_trains = []
data_tests = []

print(data_train.target_names)
for target in data_train.target_names:
    train_files.append('../data/D6_26/6dialects/splited_train/'+ target)
    test_files.append('../data/D6_26/6dialects/splited_dev/'+target)
    data_trains.append(load_files(train_files[-1], encoding = 'utf-8',decode_error='ignore'))
    data_tests.append(load_files(test_files[-1], encoding = 'utf-8',decode_error='ignore'))

#for i,target in enumerate(data_train.target_names):
 #   print("Traing Data:   {0} {1}".format(len(data_trains[i].data),target))

#for i in range(0,len(data_train.target_names)):
       # print(len(data_trains[i].data))
#print(data_train.target_names[pre]+'\t'+data_test.target_names[target])


['BEI', 'CAI', 'DOH', 'MSA', 'RAB', 'TUN']


In [332]:
def feature_union(wv,cwb,ch,sk,i):
    if i == 0 :
        union = FeatureUnion([("w_v", TfidfVectorizer(sublinear_tf=True, max_df=0.7,analyzer = 'word', ngram_range=(1,4)
                                     )),
                           ("c_wb", TfidfVectorizer(sublinear_tf=True, max_df=0.7,analyzer = 'char_wb', ngram_range=(2,5)
                                     )),
                           ("ch", TfidfVectorizer(sublinear_tf=True, max_df=0.7,analyzer = 'char', ngram_range=(2,4)
                                     )),
          ("sk",TfidfVectorizer(sublinear_tf=True, max_df=0.7,tokenizer=make_skip_tokenize(n=2, k=1)))
                           ],
    transformer_weights={
                'w_v': 0.5,
                'c_wb': cwb,
               ' ch':ch,
                'sk': sk,
            }
    ,
    )
    elif i == 1 : #CAI
        union = FeatureUnion([("w_v", TfidfVectorizer(sublinear_tf=True, max_df=0.5,analyzer = 'word', ngram_range=(1,3)
                                     )),
                           ("c_wb", TfidfVectorizer(sublinear_tf=True, max_df=0.5,analyzer = 'char_wb', ngram_range=(2,5)
                                     )),
                           ("ch", TfidfVectorizer(sublinear_tf=True, max_df=0.5,analyzer = 'char', ngram_range=(2,4)
                                     )),
          ("sk",TfidfVectorizer(sublinear_tf=True, max_df=0.5,tokenizer=make_skip_tokenize(n=2, k=2)))
                           ],
    transformer_weights={
                'w_v': wv,
                'c_wb': cwb,
               ' ch':0.7,
                'sk': sk,
            }
    ,
    )
    elif i == 3 :
        union = FeatureUnion([("w_v", TfidfVectorizer(sublinear_tf=True, max_df=0.5,analyzer = 'word', ngram_range=(1,2)
                                     )),
                           ("c_wb", TfidfVectorizer(sublinear_tf=True, max_df=0.5,analyzer = 'char_wb', ngram_range=(2,5)
                                     )),
                           ("ch", TfidfVectorizer(sublinear_tf=True, max_df=0.5,analyzer = 'char', ngram_range=(2,4)
                                     )),
          ("sk",TfidfVectorizer(sublinear_tf=True, max_df=0.5,tokenizer=make_skip_tokenize(n=2, k=1)))
                           ],
    transformer_weights={
                'w_v': wv,
                'c_wb': cwb,
               ' ch':ch,
                'sk': sk,
            }
    ,
    )
    elif i == 4:
        union = FeatureUnion([("w_v", TfidfVectorizer(sublinear_tf=True, max_df=0.5,analyzer = 'word', ngram_range=(1,3)
                                     )),
                           ("c_wb", TfidfVectorizer(sublinear_tf=True, max_df=0.5,analyzer = 'char_wb', ngram_range=(2,5)
                                     )),
                           ("ch", TfidfVectorizer(sublinear_tf=True, max_df=0.5,analyzer = 'char', ngram_range=(2,6)
                                     )),
          ("sk",TfidfVectorizer(sublinear_tf=True, max_df=0.5,tokenizer=make_skip_tokenize(n=2, k=1)))
                           ],
    transformer_weights={
                'w_v': wv,
                'c_wb': 0.7,
               ' ch':0.7,
                'sk': sk,
            }
    ,
    )
    elif i == 5:
        union = FeatureUnion([("w_v", TfidfVectorizer(sublinear_tf=True, max_df=0.5,analyzer = 'word', ngram_range=(1,2)
                                     )),
                           ("c_wb", TfidfVectorizer(sublinear_tf=True, max_df=0.5,analyzer = 'char_wb', ngram_range=(2,5)
                                     )),
                          # ("ch", TfidfVectorizer(sublinear_tf=True, max_df=0.5,analyzer = 'char', ngram_range=(2,6)
                                   #  )),
          ("sk",TfidfVectorizer(sublinear_tf=True, max_df=0.5,tokenizer=make_skip_tokenize(n=2, k=1)))
                           ],
    transformer_weights={
                'w_v': wv,
                'c_wb': 0.7,
               #' ch':0.2,
                'sk': sk,
            }
    ,
    )
    elif i == 2 :
        union = FeatureUnion([("w_v", TfidfVectorizer(sublinear_tf=True, max_df=0.5,analyzer = 'word', ngram_range=(1,3)
                                     )),
                           ("c_wb", TfidfVectorizer(sublinear_tf=True, max_df=0.5,analyzer = 'char_wb', ngram_range=(2,5)
                                     )),
                           ("ch", TfidfVectorizer(sublinear_tf=True, max_df=0.5,analyzer = 'char', ngram_range=(2,4)
                                     )),
          ("sk",TfidfVectorizer(sublinear_tf=True, max_df=0.5,tokenizer=make_skip_tokenize(n=2, k=1)))
                           ],
    transformer_weights={
                'w_v': 0.7,
                'c_wb': 0.7,
               ' ch':0.7,
                'sk': 0.4,
            }
    ,
    )
    else:
        union = FeatureUnion([("w_v", TfidfVectorizer(sublinear_tf=True, max_df=0.5,analyzer = 'word', ngram_range=(1,4)
                                     )),
                           ("c_wb", TfidfVectorizer(sublinear_tf=True, max_df=0.5,analyzer = 'char_wb', ngram_range=(2,5)
                                     )),
                           ("ch", TfidfVectorizer(sublinear_tf=True, max_df=0.5,analyzer = 'char', ngram_range=(2,4)
                                     )),
          ("sk",TfidfVectorizer(sublinear_tf=True, max_df=0.5,tokenizer=make_skip_tokenize(n=2, k=1)))
                           ],
        transformer_weights={
                'w_v': wv,
                'c_wb': cwb,
               ' ch':ch,
                'sk': sk,
            }
        ,
        )
    
    return union
    

In [335]:
#build 5 models for all sub categories 
#start fine_grained
ensembles,union_fine = [],[]
#data_trains = []
for i in range(0,len(data_train.target_names)):
    #print(data_trains[i].target_names)
    """if i == 3: #there is only one class here MSA.
        union_fine.append(0)
        ensembles.append(0)
        continue"""
    #i=0
    print(data_train.target_names[i])
    y_train_fine = data_trains[i].target
    print(data_trains[i].target_names)
    target_names_fine = data_trains[i].target_names
    print("Traing Data:   {0}".format(len(data_trains[i].data)))
    print("%d categories" % len(target_names_fine))
    union_fine.append(feature_union(0.5,0.5,0.5,0.4,i))
    X_train_fine = union_fine[-1].fit_transform(data_trains[i].data) #union.fit_transform(data_train.data)
    print("Combined space has", X_train_fine.shape[1], "features")
    ensembles.append(build_estimators().fit(X_train_fine, y_train_fine))
    print()
    X_test = union_fine[-1].transform(data_tests[i].data)
    pred = ensembles[-1].predict(X_test)
#for i in range(0,10):
 #   print(data_train.target_names[pred[i]])
    y_test = data_tests[i].target
    score = metrics.accuracy_score(y_test, pred)
    print("accuracy:   %0.3f" % score)

    print("classification report:")
    print(metrics.classification_report(y_test, pred,target_names=data_tests[i].target_names))

    print("confusion matrix:")
    print(metrics.confusion_matrix(y_test, pred))
    #break
    
    

BEI
['ALE', 'AMM', 'BEI', 'DAM', 'JER', 'SAL']
Traing Data:   9600
6 categories
Combined space has 200827 features





accuracy:   0.612
classification report:
              precision    recall  f1-score   support

         ALE       0.63      0.71      0.67       200
         AMM       0.57      0.67      0.62       200
         BEI       0.67      0.70      0.69       200
         DAM       0.63      0.56      0.59       200
         JER       0.54      0.61      0.57       200
         SAL       0.65      0.43      0.52       200

   micro avg       0.61      0.61      0.61      1200
   macro avg       0.62      0.61      0.61      1200
weighted avg       0.62      0.61      0.61      1200

confusion matrix:
[[142   8  19  13  12   6]
 [ 13 133   7  11  25  11]
 [ 22   8 140  16  10   4]
 [ 24  26  17 111  11  11]
 [ 13  28  10  12 122  15]
 [ 10  29  15  12  48  86]]
CAI
['ALX', 'ASW', 'CAI', 'KHA']
Traing Data:   6400
4 categories
Combined space has 160722 features





accuracy:   0.635
classification report:
              precision    recall  f1-score   support

         ALX       0.54      0.62      0.58       200
         ASW       0.57      0.61      0.59       200
         CAI       0.57      0.47      0.52       200
         KHA       0.89      0.83      0.86       200

   micro avg       0.64      0.64      0.64       800
   macro avg       0.64      0.64      0.64       800
weighted avg       0.64      0.64      0.64       800

confusion matrix:
[[124  40  30   6]
 [ 47 123  26   4]
 [ 50  44  95  11]
 [  9   8  17 166]]
DOH
['BAG', 'BAS', 'DOH', 'JED', 'MOS', 'RIY']
Traing Data:   9600
6 categories
Combined space has 195983 features





accuracy:   0.740
classification report:
              precision    recall  f1-score   support

         BAG       0.65      0.72      0.69       200
         BAS       0.60      0.68      0.64       200
         DOH       0.76      0.76      0.76       200
         JED       0.80      0.77      0.78       200
         MOS       0.85      0.82      0.84       200
         RIY       0.81      0.69      0.74       200

   micro avg       0.74      0.74      0.74      1200
   macro avg       0.75      0.74      0.74      1200
weighted avg       0.75      0.74      0.74      1200

confusion matrix:
[[145  40   3   5   3   4]
 [ 45 136   8   3   6   2]
 [  6  13 152  12   4  13]
 [  9  12  12 153   3  11]
 [  8  16   8   1 165   2]
 [  9   8  17  17  12 137]]
MSA
['MSA', 'MUS', 'SAN']
Traing Data:   4800
3 categories
Combined space has 123566 features





accuracy:   0.845
classification report:
              precision    recall  f1-score   support

         MSA       0.80      0.94      0.86       200
         MUS       0.84      0.74      0.79       200
         SAN       0.91      0.85      0.88       200

   micro avg       0.84      0.84      0.84       600
   macro avg       0.85      0.85      0.84       600
weighted avg       0.85      0.84      0.84       600

confusion matrix:
[[187  10   3]
 [ 38 149  13]
 [ 10  19 171]]
RAB
['ALG', 'FES', 'RAB']
Traing Data:   4800
3 categories
Combined space has 244041 features





accuracy:   0.817
classification report:
              precision    recall  f1-score   support

         ALG       0.85      0.93      0.89       200
         FES       0.77      0.80      0.78       200
         RAB       0.83      0.72      0.77       200

   micro avg       0.82      0.82      0.82       600
   macro avg       0.82      0.82      0.81       600
weighted avg       0.82      0.82      0.81       600

confusion matrix:
[[186   8   6]
 [ 17 160  23]
 [ 16  40 144]]
TUN
['BEN', 'SFX', 'TRI', 'TUN']
Traing Data:   6400
4 categories
Combined space has 110323 features





accuracy:   0.791
classification report:
              precision    recall  f1-score   support

         BEN       0.78      0.92      0.84       200
         SFX       0.72      0.83      0.77       200
         TRI       0.85      0.76      0.80       200
         TUN       0.85      0.66      0.74       200

   micro avg       0.79      0.79      0.79       800
   macro avg       0.80      0.79      0.79       800
weighted avg       0.80      0.79      0.79       800

confusion matrix:
[[183   3  13   1]
 [  9 166   7  18]
 [ 34  10 152   4]
 [  9  52   7 132]]


In [336]:
gold_test_file =  open('result26/voting_gold_1.txt','w+') 
pred_test_file = open('result26/voting_pred_1.txt','w+')
sample_file = open('result26/voting_test_set_1.txt','w+')
test_file26 = '../data/Dialect26/Multi_data/dev/new_clean'
data_test = load_files(test_file26, encoding = 'utf-8',decode_error='ignore')
X_test = union.transform(data_test.data)
y_test = data_test.target
pred = ensemble.predict(X_test)

In [337]:
c= 0
for target,pre,doc in zip(y_test,pred,data_test.data):
    """if pre == 3:
        gold_test_file.write(data_test.target_names[target]+ '\n')
        pred_test_file.write('MSA\n')
        #sample_file.write(data_test.target_names[target]+'\t'+data_trains[pre].target_names[pred26[0]]+ '\t'+doc+'\n')
        continue"""
    
    X_test = union_fine[pre].transform([doc])
    pred26 = ensembles[pre].predict(X_test)
    
    if pre == 0 and data_test.target_names[target] != data_trains[pre].target_names[pred26[0]]:
        c = c+1
        print(data_test.target_names[target]+'\t'+data_trains[pre].target_names[pred26[0]]+ '\t'+doc+'\n')

    gold_test_file.write(data_test.target_names[target]+ '\n')
    pred_test_file.write(data_trains[pre].target_names[pred26[0]]+ '\n')
    #sample_file.write(data_test.target_names[target]+'\t'+data_trains[pre].target_names[pred26[0]]+ '\t'+doc+'\n')
    
print(c)  

JER	ALE	 بدي ياك تاخدني علي محلات ممتعه انا سايح بتعرف


TUN	BEI	 نسيت مشروباتنا


SAN	DAM	 معي مكينتي


AMM	SAL	 بقدر ادفع عن طريق شيك سياحي


ALE	BEI	 بدي كريم ايد


SAL	JER	 بدي كريم ايدين


RIY	JER	 بنتقاسم


BEI	JER	 خلينا نحضر تلفزيون في مبارات باسكتبول


SAN	AMM	 كم بتكلف الليموزين لوسط المدينه


ALE	JER	 بدي كريم ليلي


ALE	DAM	 بتستقبلني هالليله


CAI	ALE	 تلاتين دولار تمانين سنت


BAS	AMM	 علي اي جهه من يونيون سكوير هو


ALE	AMM	 بيجي هاد مع قسم السلطه واختيار من البطاطا المخبوزه او البطاطا المقليه


SAL	AMM	 انا معك وحده من الاماكن المفضله عندي


MSA	AMM	 دعنا نشاهد التلفزيون فيها مباراه بيسبول


MOS	ALE	 شكرا مرحبا شكرا لانتضارك


JER	BEI	 بدي ضو


KHA	SAL	 من احجز غرفه دبل بتطل علي المحيط


TRI	JER	 همم هدا اللون يطلع حلو عليك


SAL	BEI	 المجموع طابه وحده و عصايتين


AMM	JER	 جلد التمساح


DOH	SAL	 السيفون ما ينسحب


ALE	BEI	 حاس حالي بردان وبطني عم توجعني كتير


BEI	DAM	 شو هالموسيقي


ALG	BEI	 عندي اختين


ALE	JER	 كاميرا الفيديو انسرقت من غرفتي


DOH	BEI	 عندي اختين


J

ALX	DAM	 فنجان قهوه لو سمحت


ALX	AMM	 اطلع من الباب اللي هناك واستني في محطه رقم سته


SAL	JER	 مش عارف قديش اترك بقشيش


MUS	BEI	 عندك واحد غير


JER	AMM	 احنا هون ليوم


TRI	DAM	 عندك منه الوان تانيه


SAL	AMM	 اطلع من هاد الباب هناك و استني عند نقطه التوقف سته


ALE	SAL	 الشارع كان معجء بالسيارات


SAL	JER	 شو الاسرع تكسي و لاسياره اسعاف


DAM	AMM	 لو سمحت بدي استخدم الحمام


SFX	SAL	 ما عندي لا انديه و لا احذيه


KHA	AMM	 رحله رقم اتنين صفر تمنيه الي طوكيو


ASW	ALE	 كم ولايه جنب طوكيو


JER	AMM	 ما بقدر ازبط الصوره


SAN	SAL	 لو سمحت غلفهن منفصلات


DAM	ALE	 بتفضل طاوله بالمطعم الرءيسي ولا بغرفه خاصه استاذ


BEI	ALE	 كل هالمكونات طبيعيه


DAM	BEI	 ما بعرف اديش لازم اترك بخشيش


SAL	BEI	 خلينا نحضر تلفزيون في لعبه بايسبول شغاله


JER	ALE	 تلاتميه وخمسين دولار هدا اكتر من ميزانيتي


RIY	DAM	 طلبته من وقت طويل


JED	JER	 الشارع كان مليان سيارات


JER	AMM	 انا بدي كاسه نبيد


MUS	AMM	 انا رايح بقعد مع صديق


DAM	ALE	 هي هويتي


SAL	AMM	 القطارات وقفت عشان الاضراب


DAM	AMM	 هاد الكرت

SAL	JER	 الشارع كان مليان سيارات


BAG	AMM	 عندي معداتي الخاصه


SAN	DAM	 معي علاجي


SAL	DAM	 في طفع جلدي علي ايدي


SAL	BEI	 ما بقدر انام منيح بالليل


DAM	JER	 ملح لو سمحت


MUS	JER	 ملح لو سمحت


JER	ALE	 هدول مكونات طبيعيه


KHA	JER	 مسجل الفيديو اتسرق من غرفتي


BEI	AMM	 اسمي كيمورا بدي الغي حجزي الجمعه


DAM	AMM	 ممكن تشرب معي قهوه


SAL	JER	 شكلو منيح وين بنقدر نستناك


BEI	JER	 في ناس انجرحو


JER	AMM	 شو اصغر عمر عشان نعمله


ALE	SAL	 لا بدي استاجر نص مجموعه مضارب وحذاء


SAL	BEI	 نسيت مشروباتنا


MOS	BEI	 السياره عتطلع اصوات غريبي


MUS	BEI	 عندي اختين


BAG	JER	 مسجل الفيديو انباك من غرفتي


DAM	SAL	 هاد بيجي مع بوفيه السلطه بالاضافه لتشكيله بطاطا مشويه او مقليه


AMM	JER	 وين بقدر اصف سيارتي


BAS	AMM	 انا اسف راح ارجع بعدين


SAL	AMM	 لو سمحت بدي استخدم الحمام


ALX	JER	 خمسين جنيه


BEN	SAL	 من عشره لعشرين لو سمحت


DAM	ALE	 هدول مكونات طبيعيه


ALE	JER	 بتقدر تعيف غراضي للساعه تلاته


BEI	DAM	 طيب هلا خلينا نشوفك


BEI	JER	 خمسين جنيه


KHA	ALE	 ما مشكله


ASW	ALE	 وانا