In [None]:
!pip install spaCy

In [None]:
!python -m spacy download en 


spacy.load('en') doesn't work. import the model instead, and use the 'model'.load

In [1]:
# load the data, clean-up texts, generate x = text, y = label dataset
import numpy
import re
import spacy
import en_core_web_sm
import en_vectors_glove_md


def cleanup_text(doc_text):
    #doc_text = re.sub("[^a-zA-Z]"," ", doc_text)    
    doc_text = re.sub(" http[.s]*\:.* ", " url ", doc_text)
    return doc_text

# load training data
train_comments = []
train_labels = []
with open('train_comments.csv', 'rb') as f:
    for line in f:
        sep_loc = line.rfind(',')
        train_comments.append(cleanup_text(line[:sep_loc].decode('utf-8')))
        train_labels.append(int(line[sep_loc+1:]))
print ('there are %d comments in training set ' % len(train_comments))
#load test data
test_comments = []
test_labels = []
with open('test_comments.csv', 'rb') as f:
    for line in f:
        sep_loc = line.rfind(',')
        test_comments.append(cleanup_text(line[:sep_loc].decode('utf-8')))
        test_labels.append(int(line[sep_loc+1:]))
print ('there are %d comments in testing set ' % len(test_comments))



there are 48448 comments in training set 
there are 12113 comments in testing set 


In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import numpy as np
from sklearn.externals import joblib
# try two types of data representation, bag of words and tf-idf
# try with three classifiers: logistic regression, linear svm, random forest
# do a grid search to do model selection

nlp = en_vectors_glove_md.load()
def word2vector(list_doc):
    w2v_feat = []
    for i in range(len(list_doc)):
        vector = nlp(list_doc[i]).vector
        w2v_feat.append(vector)
    return w2v_feat
    
vectorizers = [CountVectorizer, TfidfVectorizer, word2vector]
classifiers = [LogisticRegression, LinearSVC] # RandomForestClassifier
param_grid = {LogisticRegression: {'penalty':['l1', 'l2'],
                                   'C':[0.001, 0.01, 0.1, 1, 10]},
              LinearSVC: {'C':[0.001,0.01,0.1,1,10]}}
# train_comments = train_comments[:1000]
# train_labels = train_labels[:1000]
filename = ['bagWord_LR', 'tfidf_LR', 'wordVector_LR', 'bagWord_SVC', 'tfidf_SVC', 'wordVector_SVC']

counter = 0
for vect in vectorizers:
    print('Extracting features with', vect)
    if vect == word2vector:
        train_feat = word2vector(train_comments)
        test_feat = word2vector(test_comments)

    else:
        feat_ext = vect(ngram_range=(1,3), # use up to 3-gram
                    max_df=0.7 # use max df to filter stop words
                    )
        train_feat = feat_ext.fit_transform(train_comments) # feat_ext.fit(train_comments) feat_ext.transform(train_comments)
        test_feat = feat_ext.transform(test_comments)

    for classifier_type in classifiers:

        print('Training {}'.format(classifier_type))
        model = classifier_type()

        model_selector = GridSearchCV(model,
                                      param_grid[classifier_type],
                                      n_jobs=4,
                                      cv=3)
        model_selector.fit(train_feat, train_labels)
        print('Best classification acc: {}'.format(model_selector.best_score_))

        test_acc = model_selector.score(test_feat, test_labels)
        print('Test classification acc: {}'.format(test_acc))
        joblib.dump(model_selector, '{}.pkl'.format(filename[counter])) 
        prediction = model_selector.predict(test_feat)
        joblib.dump(prediction, '{}_prediction.pkl'.format(filename[counter]))
        counter+=1

# just for randomForest
RF_param_grid = {'n_estimators':[100], # fix to 100 trees
                'criterion':['gini', 'entropy'],
                'min_samples_split':[2, 4, 10],
                'n_jobs':[-1] # multi process
                                      }
print('Extracting features with', word2vector)
train_feat = word2vector(train_comments)
test_feat = word2vector(test_comments)
print('Training {}'.format(RandomForestClassifier))
model = RandomForestClassifier()
model_selector = GridSearchCV(model,
                              RF_param_grid,
                              n_jobs=4,
                              cv=3)
model_selector.fit(train_feat, train_labels)
print('Best classification acc: {}'.format(model_selector.best_score_))

test_acc = model_selector.score(test_feat, test_labels)
print('Test classification acc: {}'.format(test_acc))

joblib.dump(model_selector, 'wordVector_RF.pkl')
prediction = model_selector.predict(test_feat)
joblib.dump(prediction, 'wordVector_RF_prediction.pkl')

('Extracting features with', <class 'sklearn.feature_extraction.text.CountVectorizer'>)
Training <class 'sklearn.linear_model.logistic.LogisticRegression'>
Best classification acc: 0.683433784676
Test classification acc: 0.697349954594
Training <class 'sklearn.svm.classes.LinearSVC'>
Best classification acc: 0.678191050198
Test classification acc: 0.695038388508
('Extracting features with', <class 'sklearn.feature_extraction.text.TfidfVectorizer'>)
Training <class 'sklearn.linear_model.logistic.LogisticRegression'>
Best classification acc: 0.684940554822
Test classification acc: 0.68356311401
Training <class 'sklearn.svm.classes.LinearSVC'>
Best classification acc: 0.685497853369
Test classification acc: 0.704614876579
('Extracting features with', <function word2vector at 0x000000000EABCBA8>)
Training <class 'sklearn.linear_model.logistic.LogisticRegression'>
Best classification acc: 0.673134081902
Test classification acc: 0.685709568232
Training <class 'sklearn.svm.classes.LinearSVC'>

['wordVector_RF_prediction.pkl']

In [None]:
#check if the random-split for training set contains the same proportion of every category
import numpy as np
import matplotlib.pyplot as plt

comments = []
labels = []
with open('comments.csv', 'rb') as f:
    for line in f:
        sep_loc = line.rfind(',')
        comments.append(cleanup_text(line[:sep_loc].decode('utf-8')))
        labels.append(int(line[sep_loc+1:]))
        
plt.hist(train_labels, bins='auto')  
plt.show()
print (np.histogram(train_labels, normed = True, bins = 9))
        
plt.hist(labels, bins='auto')  
plt.show()
print (np.histogram(labels, normed = True, bins = 9))