In [2]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.cluster import AffinityPropagation
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from rule_based import total_score

# data

In [63]:
tf_idf = pd.read_csv("tf_idf_padded.csv", index_col = 0)
answers = pd.read_csv('answers_clean.csv', index_col = 0)

In [64]:
all_data = pd.merge(answers, tf_idf, left_index=True, right_index=True)
all_data.tail()

Unnamed: 0,sentense,class,0,1,2,3,4,5,6,7,...,17,18,19,20,21,22,23,24,25,26
1001,روز معلم چه تاریخی است,4,0.728783,2.07353,1.537051,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1002,تاریخ تولدت شما چه روزی است,4,1.259414,1.655942,1.10094,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1003,فردا چندم است,4,1.284966,2.040262,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1004,فردا چه مناسبتی هست,4,1.284966,1.959003,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1005,امروز چه روزیه,4,5.815125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [65]:
pca = PCA(n_components=6)

In [66]:
pca_tf_idf = pca.fit_transform(tf_idf)

In [67]:
kbest_tf_idf = SelectKBest(chi2, 
                           k=20).fit_transform(tf_idf, answers['class'])

# clustering

## kmeans

In [68]:
number_of_clusters = 4

In [69]:
kmeans = KMeans(n_clusters=number_of_clusters, random_state=0).fit(tf_idf)

In [70]:
y_pred = kmeans.labels_

In [71]:
def measure (y_pred, y_test):
    true = 0 
    false = 0
    for i, y in enumerate(y_pred) :
        if y == y_test[i] :
            true += 1 
        else :
            false +=1
    return true / (true + false)

In [72]:
all_data['cluster'] = y_pred

In [73]:
def percenetages (classes):
    one = 0
    two = 0
    three = 0
    four = 0
    for cl in classes : 
        if cl == 1 :
            one += 1
        elif cl == 2 : 
            two += 1
        elif cl == 3 :
            three += 1
        elif cl == 4 :
            four += 1
    print(one, two, three, four)

In [75]:
for i in range(number_of_clusters) : 
    cluster = i
    sents = []
    classes = []
    for index, row in all_data.iterrows() : 
        if row['cluster'] == cluster : 
            classes.append(row['class'])
            sents.append(row['sentense'])
    print(i)
    percenetages(classes)

0
159 157 141 112
1
76 50 79 89
2
3 1 4 15
3
30 20 22 48


# classification

## naive

In [85]:
gnb = GaussianNB()

In [86]:
X_train, X_test, y_train, y_test = train_test_split(pca_tf_idf,  
                                                    answers['class'], 
                                                    test_size=0.2, 
                                                    random_state=42)

In [87]:
y_pred = gnb.fit(X_train, y_train).predict(X_test)

In [88]:
print("Number of mislabeled points out of a total %d points : %d"% (X_test.shape[0], (y_test != y_pred).sum()))

Number of mislabeled points out of a total 202 points : 119


118 mislabeled with pca n_components = 6

## svm

In [89]:
X_train, X_test, y_train, y_test = train_test_split(pca_tf_idf,  
                                                    answers['class'], 
                                                    test_size=0.2, 
                                                    random_state=42)

In [90]:
svm = SVC()

In [91]:
y_pred = svm.fit(X_train, y_train).predict(X_test)

In [92]:
print("Number of mislabeled points out of a total %d points : %d"% (X_test.shape[0], (y_test != y_pred).sum()))

Number of mislabeled points out of a total 202 points : 116


In [93]:
tuned_parameters = {'kernel': ['rbf', 'linear'], 
                     'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]}

In [94]:
clf = GridSearchCV(SVC(), tuned_parameters, scoring='accuracy')
clf.fit(X_train, y_train)

GridSearchCV(estimator=SVC(),
             param_grid={'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001],
                         'kernel': ['rbf', 'linear']},
             scoring='accuracy')

In [95]:
clf.best_params_

{'C': 1, 'gamma': 0.001, 'kernel': 'linear'}

In [96]:
y_pred = clf.predict(X_test)

In [97]:
print("Number of mislabeled points out of a total %d points : %d"% (X_test.shape[0], (y_test != y_pred).sum()))

Number of mislabeled points out of a total 202 points : 130


pca with n_component = 6 115 mislabeled. grid search made it worse :D

## decision tree

In [98]:
decision_tree = DecisionTreeClassifier(random_state=0)

In [99]:
X_train, X_test, y_train, y_test = train_test_split(tf_idf,  
                                                    answers['class'], 
                                                    test_size=0.2, 
                                                    random_state=42)

In [100]:
y_pred = decision_tree.fit(X_train, y_train).predict(X_test)

In [101]:
print("Number of mislabeled points out of a total %d points : %d"% (X_test.shape[0], (y_test != y_pred).sum()))

Number of mislabeled points out of a total 202 points : 75


In [102]:
criterion = ['gini', 'entropy']
max_depth = [2,4,6,8,10,12]

In [103]:
parameters = dict(criterion=criterion,
                  max_depth=max_depth)

In [104]:
clf = DecisionTreeClassifier(random_state=0)
decision_tree_GS = GridSearchCV(clf, parameters)
y_pred = decision_tree_GS.fit(X_train, y_train).predict(X_test)

In [105]:
print("Number of mislabeled points out of a total %d points : %d"% (X_test.shape[0], (y_test != y_pred).sum()))

Number of mislabeled points out of a total 202 points : 77


75 mislabeled without pca. without pca (GS) = 77

## logostic regression

In [106]:
X_train, X_test, y_train, y_test = train_test_split(pca_tf_idf,  
                                                    answers['class'], 
                                                    test_size=0.2, 
                                                    random_state=42)

In [107]:
logistic_regression = LogisticRegression(random_state=0).fit(X_train, y_train)

In [108]:
y_pred = logistic_regression.predict(X_test) 

In [109]:
print("Number of mislabeled points out of a total %d points : %d"% (X_test.shape[0], (y_test != y_pred).sum()))

Number of mislabeled points out of a total 202 points : 129


In [110]:
param_grid = {'penalty' : ['l1', 'l2'],
             'C' : np.logspace(-4, 4, 20),
             'solver' : ['liblinear']}

In [111]:
logistic_regression_GS = GridSearchCV(LogisticRegression(), 
                                      param_grid = param_grid)

In [112]:
y_pred = logistic_regression_GS.fit(X_train, y_train).predict(X_test)

In [113]:
print("Number of mislabeled points out of a total %d points : %d"% (X_test.shape[0], (y_test != y_pred).sum()))

Number of mislabeled points out of a total 202 points : 132


n_components = 6 and the result is 125 mislabeled. 

# ensemble

### random forest

In [114]:
random_forest = RandomForestClassifier(n_estimators=13)

In [115]:
X_train, X_test, y_train, y_test = train_test_split(tf_idf,  
                                                    answers['class'], 
                                                    test_size=0.2, 
                                                    random_state=42)

In [116]:
y_pred = random_forest.fit(X_train, y_train).predict(X_test)

In [117]:
print("Number of mislabeled points out of a total %d points : %d"% (X_test.shape[0], (y_test != y_pred).sum()))

Number of mislabeled points out of a total 202 points : 71


### gradient boosting

In [118]:
gradient_boosting = GradientBoostingClassifier(random_state=0)

In [119]:
X_train, X_test, y_train, y_test = train_test_split(tf_idf,  
                                                    answers['class'], 
                                                    test_size=0.2, 
                                                    random_state=42)

In [120]:
y_pred = gradient_boosting.fit(X_train, y_train).predict(X_test)

In [121]:
print("Number of mislabeled points out of a total %d points : %d"% (X_test.shape[0], (y_test != y_pred).sum()))

Number of mislabeled points out of a total 202 points : 66


witouht pca 65 mislabeled points 

# weighted voting

In [143]:
classifers_scores = [3, 2, 0, 0]
classifiers = [gradient_boosting, random_forest, svm, gnb]

In [144]:
X_train, X_test, y_train, y_test = train_test_split(tf_idf,  
                                                    answers['class'], 
                                                    test_size=0.2, 
                                                    random_state=42)

In [145]:
def predict (record) : 
    # the input is tf_idf
    global classifers_scores
    record = np.array(record).reshape(1, 27)
    scores = {}
    for i, classifier in enumerate(classifiers) : 
        if i >= 2 : 
            y_pred = classifier.predict(pca.transform(record))
        else :
            y_pred = classifier.predict(record)
            
        y_pred = y_pred[0]
        if y_pred in scores.keys() : 
            scores[y_pred] += y_pred * classifers_scores[i]
        else :
            scores[y_pred] = y_pred * classifers_scores[i]
        
    return scores

In [146]:
y_pred = []
def predict_for_all () : 
    for index, record in X_test.iterrows(): 
        scores = predict(record)
        y_pred.append(max(scores, key=scores.get))
        
    return np.array(y_pred)

In [147]:
y_pred = predict_for_all()

In [148]:
print("Number of mislabeled points out of a total %d points : %d"% (X_test.shape[0], (y_test != y_pred).sum()))

Number of mislabeled points out of a total 202 points : 66
