## Machine Learning text classification

In [None]:
import pandas as pd

# Read file
# NOTE: the codec name 'ansi' is only registered on Windows (an alias of
# 'mbcs'), so it raises LookupError on Linux/macOS. 'cp1252' is the
# Western-European Windows ANSI code page and is available on every platform.
# NOTE(review): assumes the CSV was saved on a Western-locale Windows machine — confirm.
sampleset = pd.read_csv('fulltrainingset.csv', encoding='cp1252')

# Hold out the first TEST_ROWS rows for testing and train on the remaining rows.
# The rows are taken in file order (no shuffling).
# NOTE(review): this is an 80/20 split only if the file holds 500 rows — confirm.
TEST_ROWS = 100
trainingset = sampleset.iloc[TEST_ROWS:]
testingset = sampleset.iloc[:TEST_ROWS]

### Multinomial Naive Bayes

In [None]:
# Load packages
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score, cross_val_predict, ShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
import sklearn
import numpy as np

# Set up CountVectorizer with n-grams in range 1-3 to obtain features.
# The vocabulary is learned from the training rows only; the test rows are then
# mapped onto that same vocabulary.
# Both text columns are cast to str so that a missing value (NaN) becomes the
# token 'nan' instead of making CountVectorizer reject it as a non-string
# document. Previously only the test column was cast.
vectorizer = CountVectorizer(ngram_range=(1, 3))
train_features = vectorizer.fit_transform(trainingset.preprocessed_text.values.astype('str'))
test_features = vectorizer.transform(testingset.preprocessed_text.values.astype('str'))

# Fit Multinomial Naive Bayes on the training features
nb = MultinomialNB()
model = nb.fit(train_features, trainingset.sentiment)

# Predict the held-out rows and compare against their true labels
predictions = model.predict(test_features)
accuracy = accuracy_score(testingset.sentiment, predictions)
print('A Multinomial NB with training 80 and test 20 obtains an accuracy of ' + str(accuracy))

# Apply 3, 5 and 10-fold cross validation for Multinomial Naive Bayes.
# A dedicated vectorizer is used so that `vectorizer` (fitted on the training
# rows, and therefore defining the feature space `model` was trained on) is not
# refitted on the full sample set as a side effect.
# The text is cast to str, as for the test rows, so NaN entries do not make
# fit_transform raise.
# NOTE(review): the vocabulary is still learned from all rows before the folds
# are split; chain vectorizer and classifier in a Pipeline if strictly
# leak-free folds are needed.
# NOTE(review): the printed "training 80 and test 20" wording is kept unchanged,
# although k-fold splits the full sample set into k parts.
nb_cv_vectorizer = CountVectorizer(ngram_range=(1, 3))
train_features2 = nb_cv_vectorizer.fit_transform(sampleset.preprocessed_text.values.astype('str'))
NBpredictions3 = cross_val_score(model, train_features2, sampleset.sentiment, cv=3)
NBpredictions5 = cross_val_score(model, train_features2, sampleset.sentiment, cv=5)
NBpredictions10 = cross_val_score(model, train_features2, sampleset.sentiment, cv=10)
print('A Multinomial NB with training 80 and test 20 and 3-fold cross validation obtains an accuracy of ' + str(NBpredictions3.mean()))
print('A Multinomial NB with training 80 and test 20 and 5-fold cross validation obtains an accuracy of ' + str(NBpredictions5.mean()))
print('A Multinomial NB with training 80 and test 20 and 10-fold cross validation obtains an accuracy of ' + str(NBpredictions10.mean()))

# Apply ShuffleSplit to use other cross validation strategies by passing a cross validation iterator instead
# (5 random 70/30 splits, seeded so the splits are the same on every run)
cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)
NBcvscore = cross_val_score(model, train_features2, sampleset.sentiment, cv=cv)
NBcvscore = NBcvscore.mean()
print('A Multinomial NB with training 70 and test 30 and ShuffleSplit obtains an accuracy of ' + str(NBcvscore))

# Print prediction outcomes
# Tally how many test rows the Naive Bayes model assigned to each label
# (1 is reported as positive, 0 as neutral, -1 as negative).
pos = sum(1 for label in predictions if label == 1)
neut = sum(1 for label in predictions if label == 0)
neg = sum(1 for label in predictions if label == -1)

print("The number of predictions in the test set: " + str(len(predictions)))
print("The number of positive predictions: " + str(pos))
print("The number of neutral predictions: " + str(neut))
print("The number of negative predictions: " + str(neg))

### SVM with linear kernel

In [None]:
# Load packages
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score, cross_val_predict, ShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn import svm
import sklearn
import numpy as np

# Model uses 80 training and 20 testing
# (reuses `train_features` / `test_features` computed in an earlier cell)

# Linear-kernel support vector classifier
clf = svm.SVC(C=1, kernel='linear')
clfmodel = clf.fit(train_features, trainingset['sentiment'])

# Score the held-out rows against their true labels
svmpredictions = clfmodel.predict(test_features)
svmaccuracy = metrics.accuracy_score(testingset['sentiment'], svmpredictions)
print('A Linear SVM with training 80 and test 20 obtains an accuracy of ' + str(svmaccuracy))

# Apply 3, 5 and 10-fold cross validation for a Linear Support Vector Machine.
# A dedicated vectorizer is used so that the shared `vectorizer` (whose
# vocabulary defines the columns of `train_features` / `test_features`) is not
# refitted on the full sample set as a side effect.
# The text is cast to str, as for the test rows, so NaN entries do not make
# fit_transform raise.
# NOTE(review): the vocabulary is still learned from all rows before the folds
# are split; chain vectorizer and classifier in a Pipeline if strictly
# leak-free folds are needed.
# NOTE(review): the printed "training 80 and test 20" wording is kept unchanged,
# although k-fold splits the full sample set into k parts.
svm_cv_vectorizer = CountVectorizer(ngram_range=(1, 3))
train_features3 = svm_cv_vectorizer.fit_transform(sampleset.preprocessed_text.values.astype('str'))
SVMpredictions3 = cross_val_score(clf, train_features3, sampleset.sentiment, cv=3)
SVMpredictions5 = cross_val_score(clf, train_features3, sampleset.sentiment, cv=5)
SVMpredictions10 = cross_val_score(clf, train_features3, sampleset.sentiment, cv=10)
print('A Linear SVM with training 80 and test 20 and 3-fold cross validation obtains an accuracy of ' + str(SVMpredictions3.mean()))
print('A Linear SVM with training 80 and test 20 and 5-fold cross validation obtains an accuracy of ' + str(SVMpredictions5.mean()))
print('A Linear SVM with training 80 and test 20 and 10-fold cross validation obtains an accuracy of ' + str(SVMpredictions10.mean()))

# Apply ShuffleSplit to use other cross validation strategies by passing a cross validation iterator instead
# (5 random 70/30 splits, seeded so the splits are the same on every run)
cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)
svmcvscore = cross_val_score(clf, train_features3, sampleset.sentiment, cv=cv)
svmcvscore = svmcvscore.mean()
print('A SVM with training 70 and test 30 and ShuffleSplit obtains an accuracy of ' + str(svmcvscore))

# Print prediction outcomes
# Tally how many test rows the SVM assigned to each label
# (1 is reported as positive, 0 as neutral, -1 as negative).
pos = sum(1 for label in svmpredictions if label == 1)
neut = sum(1 for label in svmpredictions if label == 0)
neg = sum(1 for label in svmpredictions if label == -1)

print("The number of predictions in the test set: " + str(len(svmpredictions)))
print("The number of positive predictions: " + str(pos))
print("The number of neutral predictions: " + str(neut))
print("The number of negative predictions: " + str(neg))

### Random Forest

In [None]:
# Load packages
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score, cross_val_predict, ShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn import svm
import sklearn
import numpy as np

# Model uses 80 training and 20 testing
# (reuses `train_features` / `test_features` computed in an earlier cell)

# Fit Random Forest
# A forest is built with randomness (bootstrap samples, feature sub-sampling),
# so the seed is fixed to make the reported accuracy repeatable across runs.
# 0 matches the random_state already used for ShuffleSplit in this notebook.
rf = RandomForestClassifier(random_state=0)
rfmodel = rf.fit(train_features, trainingset.sentiment)

# Predict test features and compare against the true labels
rfpredictions = rfmodel.predict(test_features)
rfaccuracy = accuracy_score(testingset.sentiment, rfpredictions)
print('A Random Forest with training 80 and test 20 obtains an accuracy of ' + str(rfaccuracy))

# Apply 3, 5 and 10-fold cross validation for a Random Forest.
# A dedicated vectorizer is used so that the shared `vectorizer` (whose
# vocabulary defines the columns of `train_features` / `test_features`) is not
# refitted on the full sample set as a side effect.
# The text is cast to str, as for the test rows, so NaN entries do not make
# fit_transform raise.
# NOTE(review): the vocabulary is still learned from all rows before the folds
# are split; chain vectorizer and classifier in a Pipeline if strictly
# leak-free folds are needed.
# NOTE(review): the printed "training 80 and test 20" wording is kept unchanged,
# although k-fold splits the full sample set into k parts.
rf_cv_vectorizer = CountVectorizer(ngram_range=(1, 3))
train_features4 = rf_cv_vectorizer.fit_transform(sampleset.preprocessed_text.values.astype('str'))
rfpredictions3 = cross_val_score(rf, train_features4, sampleset.sentiment, cv=3)
rfpredictions5 = cross_val_score(rf, train_features4, sampleset.sentiment, cv=5)
rfpredictions10 = cross_val_score(rf, train_features4, sampleset.sentiment, cv=10)
print('A Random Forest with training 80 and test 20 and 3-fold cross validation obtains an accuracy of ' + str(rfpredictions3.mean()))
print('A Random Forest with training 80 and test 20 and 5-fold cross validation obtains an accuracy of ' + str(rfpredictions5.mean()))
print('A Random Forest with training 80 and test 20 and 10-fold cross validation obtains an accuracy of ' + str(rfpredictions10.mean()))

# Apply ShuffleSplit to use other cross validation strategies by passing a cross validation iterator instead
# (5 random 70/30 splits, seeded so the splits are the same on every run)
cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)
rfcvscore = cross_val_score(rf, train_features4, sampleset.sentiment, cv=cv)
rfcvscore = rfcvscore.mean()
print('A Random Forest with training 70 and test 30 and ShuffleSplit obtains an accuracy of ' + str(rfcvscore))

# Print prediction outcomes
# Tally how many test rows the Random Forest assigned to each label
# (1 is reported as positive, 0 as neutral, -1 as negative).
pos = sum(1 for label in rfpredictions if label == 1)
neut = sum(1 for label in rfpredictions if label == 0)
neg = sum(1 for label in rfpredictions if label == -1)

print("The number of predictions in the test set: " + str(len(rfpredictions)))
print("The number of positive predictions: " + str(pos))
print("The number of neutral predictions: " + str(neut))
print("The number of negative predictions: " + str(neg))

### Logistic Regression with TF-IDF vectorizer

In [None]:
# Load packages
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
import numpy as np

# Fit Logistic Regression and TF-IDF vectorizer.
# The TF-IDF matrices and the fitted model get their own names
# (`lr_train_features`, `lr_test_features`, `lrmodel`) instead of rebinding
# `train_features`, `test_features` and `model` from the Naive Bayes cell.
# Rebinding them meant that re-running the SVM, Random Forest or Naive Bayes
# cells after this one silently trained or scored with the wrong objects.
# Both text columns are cast to str so a NaN entry becomes the token 'nan'
# rather than making the vectorizer raise; previously only the test column was cast.
lr = LogisticRegression()
tvec = TfidfVectorizer(ngram_range=(1, 3))
lr_train_features = tvec.fit_transform(trainingset.preprocessed_text.values.astype('str'))
lr_test_features = tvec.transform(testingset.preprocessed_text.values.astype('str'))
lrmodel = lr.fit(lr_train_features, trainingset.sentiment)

# Now we can use the model to predict classifications for our test features.
lrpredictions = lrmodel.predict(lr_test_features)
lraccuracy = accuracy_score(testingset.sentiment, lrpredictions)
print('A Logistic Regression with TFIDF and a training 80 and test 20 obtains an accuracy of ' + str(lraccuracy))

# Apply 3, 5 and 10-fold cross validation for Logistic Regression with TF-IDF.
# The features used to be built with `vectorizer` (the CountVectorizer from the
# Naive Bayes cell), so the scores were for raw n-gram counts, not for the
# TF-IDF model this section describes.
# The TF-IDF step and the classifier are now chained in a Pipeline and scored on
# the raw text. cross_val_score clones the pipeline for every fold, so the
# TF-IDF weights are fitted on each training fold only, and `tvec` / `lr`
# themselves are left untouched.
# The text is cast to str, as for the test rows, so NaN entries do not raise.
# NOTE(review): the printed "training 80 and test 20" wording is kept unchanged,
# although k-fold splits the full sample set into k parts.
lr_texts = sampleset.preprocessed_text.values.astype('str')
lr_pipeline = Pipeline([('tfidf', tvec), ('lr', lr)])
lrpredictions3 = cross_val_score(lr_pipeline, lr_texts, sampleset.sentiment, cv=3)
lrpredictions5 = cross_val_score(lr_pipeline, lr_texts, sampleset.sentiment, cv=5)
lrpredictions10 = cross_val_score(lr_pipeline, lr_texts, sampleset.sentiment, cv=10)
print('A Logistic Regression with training 80 and test 20 and 3-fold cross validation obtains an accuracy of ' + str(lrpredictions3.mean()))
print('A Logistic Regression with training 80 and test 20 and 5-fold cross validation obtains an accuracy of ' + str(lrpredictions5.mean()))
print('A Logistic Regression with training 80 and test 20 and 10-fold cross validation obtains an accuracy of ' + str(lrpredictions10.mean()))

# Apply ShuffleSplit to use other cross validation strategies by passing a cross validation iterator instead
# (5 random 70/30 splits, seeded so the splits are the same on every run)
cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)
lrcvscore = cross_val_score(lr_pipeline, lr_texts, sampleset.sentiment, cv=cv)
lrcvscore = lrcvscore.mean()
print('A Logistic Regression with training 70 and test 30 and ShuffleSplit obtains an accuracy of ' + str(lrcvscore))

# Print prediction outcomes
# Tally how many test rows the Logistic Regression assigned to each label
# (1 is reported as positive, 0 as neutral, -1 as negative).
pos = sum(1 for label in lrpredictions if label == 1)
neut = sum(1 for label in lrpredictions if label == 0)
neg = sum(1 for label in lrpredictions if label == -1)

# The total is taken from `lrpredictions`; it previously read the length of
# `predictions`, which is the Naive Bayes output from an earlier cell.
print("The number of predictions in the test set: " + str(len(lrpredictions)))
print("The number of positive predictions: " + str(pos))
print("The number of neutral predictions: " + str(neut))
print("The number of negative predictions: " + str(neg))