In [1]:
import numpy as np
import csv
import pickle
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_score, \
    recall_score, confusion_matrix, classification_report, \
    accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler


  from numpy.core.umath_tests import inner1d


In this script, we have three ML model (using 5 fold cross validation):
SVC: C-Support Vector Classification.
SGDClassifier: Linear classifiers (SVM, logistic regression, a.o.) with SGD training.
This estimator implements regularized linear models with stochastic gradient descent (SGD) learning: the gradient of the loss is estimated each sample at a time and the model is updated along the way with a decreasing strength schedule (aka learning rate). SGD allows minibatch (online/out-of-core) learning, see the partial_fit method. For best results using the default learning rate schedule, the data should have zero mean and unit variance.

Random Forest:
RF gives the best result:
f1: 0.95 (+/- 0.08)
precision: 0.8666666666666668
recall: 0.8428571428571429

Rule based: F1: 0.98245614   Recall: 1.0 Precision: 0.8125


In [8]:
class myQuote:
    def __init__(self, text):
        #read in CSV, process
        self.quoteText = text
        self.quoteID = hash(self.quoteText)
        self.link = []
        self.name = []
        self.description = []
        self.scores = []
        self.cos = []
    
    def __hash__(self):
        return self.quoteID
    
    def __str__(self):
        return "Object text: " + self.quoteText + '\n' +\
        "Links: " + str(self.link) + '\n' +\
        "Names: " + str(self.name) + '\n' +\
        "Description " + str(self.description) + '\n' +\
        "Scores: " + str(self.scores)
        

In [11]:
infile = open('searchResults_Multicos','rb')
results = pickle.load(infile, encoding='bytes')

In [12]:
for item in results:
    print(results[item].scores)

[1, 1, 5, 7, 5, 7, 1, 1, 6, 0]
[5, 2, 3, 1, 1, 1, 1, 1, 3, 0]
[5, 7, 3, 5, 9, 9, 5, 1, 1, 0]
[1, 1, 1, 1, 3, 1, 1, 1, 1, 3]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[1, 1, 1, 1, 1, 1, 1, 5, 1, 1]
[1, 1, 1, 1, 1, 1, 1, 1, 3, 1]
[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]
[5, 5, 5, 5, 1, 1, 1, 1, 1, 0]
[1, 1, 1, 1, 5, 1, 1, 1, 1, 0]
[1, 1, 1, 5, 1, 3, 1, 1, 0, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[1, 1, 1, 1, 1, 1, 2, 2, 1, 0]
[1, 5, 5, 4, 1, 5, 1, 1, 5, 0]
[1, 5, 5, 1, 5, 5, 5, 1, 7, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[1, 1, 3, 1, 3, 1, 7, 1, 1, 1]
[1, 7, 1, 1, 1, 1, 1, 1, 1, 1]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[1, 1, 1, 1, 5, 1, 1, 1, 1, 0]
[5, 1, 5, 1, 1, 5, 1, 3, 0, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]
[1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]
[1, 1, 1

[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 1, 5, 1, 1, 5, 1, 1, 1, 1]
[1, 1, 1, 1, 1, 1, 0, 0, 0, 0]
[1, 1, 1, 1, 1, 1, 1, 0, 0, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[1, 1, 1, 1, 1, 1, 3, 1, 1, 1]
[1, 1, 1, 1, 1, 1, 0, 0, 0, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 0]
[4, 1, 5, 5, 1, 1, 2, 1, 5, 0]
[3, 1, 1, 1, 3, 1, 1, 1, 0, 0]
[1, 1, 5, 3, 3, 1, 5, 5, 5, 1]
[1, 1, 1, 1, 1, 1, 3, 1, 1, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 0]
[3, 3, 1, 1, 3, 1, 1, 1, 1, 1]
[3, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 0]
[4, 1, 1, 1, 0, 0, 0, 0, 0, 0]
[1, 1, 3, 1, 1, 1, 1, 1, 4, 5]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 0]
[1, 1, 1, 3, 1, 1, 1, 4, 1, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[1, 9, 1, 1, 1, 3, 1, 3, 1, 0]
[9, 7, 4, 5, 5, 1, 1, 1, 0, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[1, 1, 1

[1, 1, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[1, 1, 3, 1, 1, 0, 0, 0, 0, 0]
[1, 1, 5, 1, 1, 1, 1, 1, 1, 1]
[1, 1, 1, 1, 1, 1, 1, 0, 0, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 0]
[9, 9, 5, 3, 7, 9, 3, 1, 3, 1]
[1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
[4, 1, 1, 1, 1, 1, 1, 1, 1, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]
[5, 3, 3, 6, 1, 5, 1, 1, 0, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[1, 1, 1, 1, 3, 1, 1, 1, 1, 1]
[1, 1, 1, 3, 1, 1, 1, 1, 1, 1]
[1, 1, 1, 1, 0, 0, 0, 0, 0, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[2, 5, 7, 5, 4, 1, 3, 1, 0, 0]
[1, 5, 1, 11, 1, 1, 1, 0, 0, 0]
[3, 1, 1, 2, 7, 5, 1, 0, 0, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]
[3, 1, 5, 5, 5, 5, 3, 5, 5, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 0]
[5, 1, 1, 1, 1, 1, 3, 1, 0, 0]
[5, 5, 1, 1, 3, 1, 1, 1, 5, 3]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[5, 9, 7, 5, 1, 9, 6, 3, 0, 0]
[3, 3, 5, 5, 3, 3, 0, 0, 0, 0]
[1, 1, 

In [13]:
labels = pd.read_csv('./LableResults/MLData_recode_mannual.csv')
label = labels['label'].values
label = label.astype(np.float)
print(len(label))
print(type(label))


200
<class 'numpy.ndarray'>


In [29]:
def append_features(ob):
    count = 0
    proto_matrix = []
    for item in ob:
        col2 = np.append(ob[item].scores, ob[item].cos)
       # print(col2)
        proto_matrix.append(col2)
        count += 1
        if count > 199:
            break
      
    return proto_matrix

proto_matrix = append_features(results)
FeatureMatrix = np.matrix(proto_matrix)

In [32]:
FeatureMatrix[0:3]

matrix([[1.        , 1.        , 5.        , 7.        , 5.        ,
         7.        , 1.        , 1.        , 6.        , 0.        ,
         0.81215575, 0.84446287, 0.8832385 , 0.89525542, 0.89731908,
         0.90207933, 0.91034729, 0.90658497, 0.90142585, 0.        ],
        [5.        , 2.        , 3.        , 1.        , 1.        ,
         1.        , 1.        , 1.        , 3.        , 0.        ,
         0.96617609, 0.96843455, 0.97933499, 0.97720837, 0.97909776,
         0.98094155, 0.98114494, 0.97788701, 0.97699932, 0.        ],
        [5.        , 7.        , 3.        , 5.        , 9.        ,
         9.        , 5.        , 1.        , 1.        , 0.        ,
         0.96975383, 0.96801451, 0.95080867, 0.95405933, 0.95963737,
         0.96563402, 0.96883438, 0.97108591, 0.96864645, 0.        ]])

In [31]:
scaler = StandardScaler()
scaled_matrix = scaler.fit_transform(FeatureMatrix)


In [33]:
#data
X = scaled_matrix
y = label

In [34]:
#split data for grish search
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [35]:
#####grid search (the parameters predict everything to one class, we should use a separated 
#sample for tuning parameters, but not enough cases so far)
svc = svm.SVC()
parameters = [{'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': [0.01, 0.001, 0.0001],
                     'C': [0.1, 0.3, 0.5, 0.7, 0.9, 1.0, 1.3, 1.5, 1.7, 2.0, 3.0], 'class_weight':['balanced']}]
                   
grid_search_item = GridSearchCV(estimator = svc,
                          param_grid = parameters,
                           scoring = 'accuracy',
                           n_jobs = -1)
grid_search = grid_search_item.fit(X_train, y_train)

print(grid_search.best_score_)####not sure how come this is so good,but the parameters don't work good in the model
print(grid_search.best_params_)

means = grid_search.cv_results_['mean_test_score']
stds = grid_search.cv_results_['std_test_score']
params = grid_search.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

0.9142857142857143
{'C': 1.3, 'class_weight': 'balanced', 'gamma': 0.001, 'kernel': 'rbf'}
0.864286 (0.036720) with: {'C': 0.1, 'class_weight': 'balanced', 'gamma': 0.01, 'kernel': 'linear'}
0.857143 (0.008690) with: {'C': 0.1, 'class_weight': 'balanced', 'gamma': 0.01, 'kernel': 'poly'}
0.900000 (0.052718) with: {'C': 0.1, 'class_weight': 'balanced', 'gamma': 0.01, 'kernel': 'rbf'}
0.900000 (0.049917) with: {'C': 0.1, 'class_weight': 'balanced', 'gamma': 0.01, 'kernel': 'sigmoid'}
0.864286 (0.036720) with: {'C': 0.1, 'class_weight': 'balanced', 'gamma': 0.001, 'kernel': 'linear'}
0.385714 (0.338475) with: {'C': 0.1, 'class_weight': 'balanced', 'gamma': 0.001, 'kernel': 'poly'}
0.150000 (0.001521) with: {'C': 0.1, 'class_weight': 'balanced', 'gamma': 0.001, 'kernel': 'rbf'}
0.407143 (0.369107) with: {'C': 0.1, 'class_weight': 'balanced', 'gamma': 0.001, 'kernel': 'sigmoid'}
0.864286 (0.036720) with: {'C': 0.1, 'class_weight': 'balanced', 'gamma': 0.0001, 'kernel': 'linear'}
0.385714 (0

In [36]:
svc = svm.SVC(gamma=0.001, class_weight='balanced', C = 1.3, kernel = 'rbf')
#svc.fit(X_train, y_train) 

In [37]:
scores = cross_val_score(svc, X, y, cv=5, scoring='f1_weighted')
print(scores)
print("f1: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
precision = cross_val_score(svc, X, y, cv=5, scoring='precision')
recall = cross_val_score(svc, X, y, cv=5, scoring='recall')
print(precision.mean())
print(recall.mean())

[0.81411678 0.88563686 0.89142857 0.92015447 0.97338482]
f1: 0.90 (+/- 0.10)
0.7988888888888889
0.6


####SGDClassifier 

In [38]:
#####grid search (the parameters predict everything to one class, we should use a separated 
#sample for tuning parameters, but not enough cases so far)
svc = SGDClassifier(max_iter= 1000)
parameters = [{'alpha': [0.01, 0.05, 0.001, 0.005], 'class_weight':['balanced']}]
                   
grid_search_item = GridSearchCV(svc,
                          param_grid = parameters,
                           scoring = 'accuracy',
                           n_jobs = -1)
grid_search = grid_search_item.fit(X_train, y_train)

print(grid_search.best_score_)####not sure how come this is so good,but the parameters don't work good in the model
print(grid_search.best_params_)

0.8785714285714286
{'alpha': 0.01, 'class_weight': 'balanced'}


In [39]:
clf = SGDClassifier(max_iter=1000, alpha=0.01, class_weight = 'balanced')
clf.fit(X_train, y_train) 

SGDClassifier(alpha=0.01, average=False, class_weight='balanced', epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=1000, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)

In [538]:
#y = clf.predict(X_test)


In [40]:
scores = cross_val_score(clf, X, y, cv=5, scoring='f1_weighted')
print(scores)
print("f1: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
precision = cross_val_score(clf, X, y, cv=5, scoring='precision')
recall = cross_val_score(clf, X, y, cv=5, scoring='recall')
print(precision.mean())
print(recall.mean())

[0.82470496 0.92872224 0.87022398 0.90315934 0.94871795]
f1: 0.90 (+/- 0.09)
0.6468315018315018
0.7761904761904763


###random forest

In [450]:
rf = RandomForestClassifier()

In [41]:
#####grid search (the parameters predict everything to one class, we should use a separated 
#sample for tuning parameters, but not enough cases so far)

parameters = [{'max_features':['auto','sqrt','log2'], 'class_weight':['balanced'], 
               'max_leaf_nodes':[10,50,100], 'max_depth':[2,5,10,20], 'n_estimators' : [50,100,200,300,400]}]
                   
grid_search_item = GridSearchCV(rf,
                          param_grid = parameters,
                           scoring = 'accuracy',
                           n_jobs = -1)
grid_search = grid_search_item.fit(X_train, y_train)

print(grid_search.best_score_)####not sure how come this is so good,but the parameters don't work good in the model
print(grid_search.best_params_)

means = grid_search.cv_results_['mean_test_score']
stds = grid_search.cv_results_['std_test_score']
params = grid_search.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

0.9285714285714286
{'class_weight': 'balanced', 'max_depth': 5, 'max_features': 'auto', 'max_leaf_nodes': 10, 'n_estimators': 50}
0.914286 (0.016780) with: {'class_weight': 'balanced', 'max_depth': 2, 'max_features': 'auto', 'max_leaf_nodes': 10, 'n_estimators': 50}
0.914286 (0.016780) with: {'class_weight': 'balanced', 'max_depth': 2, 'max_features': 'auto', 'max_leaf_nodes': 10, 'n_estimators': 100}
0.914286 (0.016780) with: {'class_weight': 'balanced', 'max_depth': 2, 'max_features': 'auto', 'max_leaf_nodes': 10, 'n_estimators': 200}
0.914286 (0.016780) with: {'class_weight': 'balanced', 'max_depth': 2, 'max_features': 'auto', 'max_leaf_nodes': 10, 'n_estimators': 300}
0.914286 (0.016780) with: {'class_weight': 'balanced', 'max_depth': 2, 'max_features': 'auto', 'max_leaf_nodes': 10, 'n_estimators': 400}
0.914286 (0.016780) with: {'class_weight': 'balanced', 'max_depth': 2, 'max_features': 'auto', 'max_leaf_nodes': 50, 'n_estimators': 50}
0.914286 (0.016780) with: {'class_weight': '

In [42]:
rf = RandomForestClassifier(n_estimators=100, max_depth=2, class_weight = 'balanced',
                             random_state=0, max_leaf_nodes = 10, max_features = 'auto')

In [43]:
scores = cross_val_score(rf, X, y, cv=5, scoring='f1_weighted')
print(scores)
print("f1: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
precision = cross_val_score(rf, X, y, cv=5, scoring='precision')
recall = cross_val_score(rf, X, y, cv=5, scoring='recall')
print(precision.mean())
print(recall.mean())

[0.97484841 0.93159695 0.89142857 0.88156288 0.94434389]
f1: 0.92 (+/- 0.07)
0.8011111111111111
0.7714285714285715


#Rule based classifier

In [332]:
rule = pd.read_csv('./LableResults/EvaluateSample3_fscore.csv')
rule[0:5]

Unnamed: 0,hash,text,count,cosineSim,label,manual
0,-4.24e+18,VIRUS SPREADING LIKE WILDFIRE ON FB. ITS A TRO...,"[0, 0, 0, 0, 0, 0, 0, 0, 0]",0.956189,0,1
1,7.57e+18,They say true friends go long periods of time ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0]",0.92427,0,1
2,3.58e+18,Going to church doesn't make you a holy person...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0.936195,0,1
3,-1.99e+18,From Lady Gaga's Bad Romance: /I want your psy...,"[4, 0, 0]",0.932406,0,1
4,2.91e+18,The first woman was created from the rib of a ...,"[4, 1, 2, 0, 0, 0, 0, 0, 2]",0.976999,1,1


In [333]:
y_pred = rule['label']
y = rule['manual']

In [339]:
print('F1:', f1_score(y_pred, y, average= None))
cm = confusion_matrix(y_pred, y)
print('confusion Matrix:', cm)
print ('Recall:', recall_score(y_pred, y))
print ('Precision:', precision_score(y_pred, y))

F1: [0.98245614 0.89655172]
confusion Matrix: [[168   6]
 [  0  26]]
Recall: 1.0
Precision: 0.8125
