In [None]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
import nltk
import re

In [None]:
# load data
# Read the annotated posts and split them by the `is_relevant` label:
#   1 -> relevant (non-spam) post, 0 -> spam post.
# Rows whose label is neither 0 nor 1 are left out of both subsets.
data = pd.read_csv('posts/shuffled_posts.csv', encoding = 'latin1')
is_relevant = data['is_relevant'] == 1
is_labelled = is_relevant | (data['is_relevant'] == 0)
relevant_ix = data.index[is_relevant].tolist() # non-spam indices
all_ix = data.index[is_labelled].tolist() # includes spam
# `DataFrame.ix` was removed from pandas; `.loc` is the label-based
# replacement and selects the same rows in the same order.
data_annotated = data.loc[relevant_ix]
data_all = data.loc[all_ix]

In [None]:
# preprocessing
# Lower-case, tokenize and Porter-stem the message of every annotated post.
# `text` ends up holding one list of stemmed tokens per post, in row order.
stemmer = nltk.stem.porter.PorterStemmer()
text = []
for message in data_annotated['status_message']:
    tokens = nltk.word_tokenize(message.lower())
    text.append([stemmer.stem(tok) for tok in tokens])

In [None]:
# construct vocabulary with ttf
# `ttf` maps each stemmed token to its total term frequency, i.e. the number
# of times it occurs across all posts in `text`.
k = 10 # minimum total term frequency for a token to enter the vocabulary
ttf = {}
for words in text:
    for word in words:
        ttf[word] = ttf.get(word, 0) + 1
# Raw string for the pattern: '\d' in a plain literal is an invalid escape
# sequence that newer Python versions warn about. The regex is unchanged.
vocab = [word for word, count in ttf.items() if count >= k and not re.search(r'\d', word)] # high word count and not date

In [None]:
# create feature vectors
# Binary bag-of-words: X[i, j] is 1 when vocab[j] occurs in post i, else 0.
X = []
for row in text:
    # Set membership is O(1); scanning the token list once per vocabulary
    # word made this loop needlessly quadratic.
    tokens = set(row)
    X.append([1 if voc in tokens else 0 for voc in vocab])
# Build a plain ndarray instead of np.matrix: NumPy discourages np.matrix and
# current scikit-learn refuses it as estimator input. The reshape keeps X
# two-dimensional even when `text` or `vocab` is empty.
X = np.array(X, dtype=int).reshape(len(text), len(vocab))
# Target labels, aligned row-for-row with X.
y_driver = np.array(data_annotated['is_driver'])
y_roundtrip = np.array(data_annotated['is_roundtrip'])

In [None]:
# machine learning, driver or rider
# Compare candidate classifiers on the `is_driver` target using 5-fold
# cross-validation and print mean score +/- two standard deviations.
clfs = [
    MultinomialNB(),
    BernoulliNB(),
    LogisticRegression(),
    LinearSVC(C=.1),
    # `criterion` must be passed by keyword: scikit-learn estimators no longer
    # accept positional constructor arguments.
    DecisionTreeClassifier(criterion='entropy'),
    RandomForestClassifier(),
    KNeighborsClassifier(n_neighbors=3),
]
for clf in clfs:
    scores = cross_val_score(clf, X, y_driver, cv = 5)
    print(clf)
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    print("--------------------")

In [None]:
# best classifier
# Final driver-vs-rider model: a multinomial naive Bayes classifier fit on the
# full feature matrix X with the `is_driver` labels (no held-out split here).
# NOTE(review): `vocab` and `X` are rebound by the later spam cells, so the
# vocabulary this model was fit on is not retained under its own name.
clf_driver = MultinomialNB()
clf_driver.fit(X, y_driver)

In [None]:
# machine learning, round trip?
# Compare candidate classifiers on the `is_roundtrip` target using 5-fold
# cross-validation and print mean score +/- two standard deviations.
clfs = [
    MultinomialNB(),
    BernoulliNB(),
    LogisticRegression(C=.1),
    LinearSVC(C=.01),
    # `criterion` must be passed by keyword: scikit-learn estimators no longer
    # accept positional constructor arguments.
    DecisionTreeClassifier(criterion='entropy'),
    RandomForestClassifier(),
    KNeighborsClassifier(n_neighbors=3),
]
for clf in clfs:
    scores = cross_val_score(clf, X, y_roundtrip, cv = 5)
    print(clf)
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    print("--------------------")

In [None]:
# Final round-trip model: a linear SVM with C=.01, fit on the full feature
# matrix X with the `is_roundtrip` labels (no held-out split here).
# NOTE(review): `vocab` and `X` are rebound by the later spam cells, so the
# vocabulary this model was fit on is not retained under its own name.
clf_roundtrip = LinearSVC(C=.01)
clf_roundtrip.fit(X, y_roundtrip)

In [None]:
# preprocessing, spam
# Same normalisation as for the annotated posts, now over every labelled post
# (spam included): lower-case, tokenize, Porter-stem. This rebinds `text` to
# one list of stemmed tokens per row of `data_all`.
stemmer = nltk.stem.porter.PorterStemmer()
text = []
for message in data_all['status_message']:
    tokens = nltk.word_tokenize(message.lower())
    text.append([stemmer.stem(tok) for tok in tokens])

In [None]:
# construct vocabulary with ttf, spam
# `ttf` maps each stemmed token to its total term frequency, i.e. the number
# of times it occurs across all posts in `text`. This rebinds `k`, `ttf` and
# `vocab` from the earlier vocabulary cell.
k = 10 # minimum total term frequency for a token to enter the vocabulary
ttf = {}
for words in text:
    for word in words:
        ttf[word] = ttf.get(word, 0) + 1
# Raw string for the pattern: '\d' in a plain literal is an invalid escape
# sequence that newer Python versions warn about. The regex is unchanged.
vocab = [word for word, count in ttf.items() if count >= k and not re.search(r'\d', word)] # high word count and not date

In [None]:
# create feature vectors, spam
# Binary bag-of-words: X[i, j] is 1 when vocab[j] occurs in post i, else 0.
# This rebinds `X` from the earlier feature-vector cell.
X = []
for row in text:
    # Set membership is O(1); scanning the token list once per vocabulary
    # word made this loop needlessly quadratic.
    tokens = set(row)
    X.append([1 if voc in tokens else 0 for voc in vocab])
# Build a plain ndarray instead of np.matrix: NumPy discourages np.matrix and
# current scikit-learn refuses it as estimator input. The reshape keeps X
# two-dimensional even when `text` or `vocab` is empty.
X = np.array(X, dtype=int).reshape(len(text), len(vocab))
# Target labels (1 = relevant, 0 = spam), aligned row-for-row with X.
y_relevant = np.array(data_all['is_relevant'])

In [None]:
# machine learning, spam
# Compare candidate classifiers on the `is_relevant` target using 5-fold
# cross-validation and print mean score +/- two standard deviations.
clfs = [
    MultinomialNB(),
    BernoulliNB(),
    LogisticRegression(),
    LinearSVC(),
    # `criterion` must be passed by keyword: scikit-learn estimators no longer
    # accept positional constructor arguments.
    DecisionTreeClassifier(criterion='entropy'),
    RandomForestClassifier(criterion='entropy'),
    KNeighborsClassifier(n_neighbors=1),
]
for clf in clfs:
    scores = cross_val_score(clf, X, y_relevant, cv = 5)
    print(clf)
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    print("--------------------")

In [None]:
# Final spam filter: logistic regression with default settings, fit on the
# full feature matrix X with the `is_relevant` labels (no held-out split here).
# In a top-to-bottom run, X at this point is the matrix built from `data_all`.
clf_relevant = LogisticRegression()
clf_relevant.fit(X, y_relevant)

In [None]:
# Show the configuration of the three fitted classifiers, one per line.
print(clf_driver, clf_roundtrip, clf_relevant, sep="\n")
# use these classifiers on rest of dataset