In [None]:
import sklearn as sk
import os

In [None]:
# Directories holding the review files. Only the negative-polarity pair is
# loaded by the visible cells; the positive-polarity pair is defined alongside it.
path_neg_true, path_neg_false = (
    './database/spam/negative_polarity/truthful_from_Web',
    './database/spam/negative_polarity/deceptive_from_MTurk',
)

path_pos_true, path_pos_false = (
    './database/spam/positive_polarity/truthful_from_TripAdvisor',
    './database/spam/positive_polarity/deceptive_from_MTurk',
)
  
import nltk
# Fetch the NLTK resources one after another, in the same order as before.
for _nltk_resource in ('punkt', 'stopwords'):
    nltk.download(_nltk_resource)

from nltk.tokenize import word_tokenize
from nltk import ngrams
import string

def read_range(data,a,b,n, label):
    """Read every file in the sub-folders ``fold<a>`` .. ``fold<b-1>`` of ``data``.

    Args:
        data: directory that contains the ``fold<i>`` sub-folders.
        a: first fold number to read (inclusive).
        b: fold number to stop at (exclusive).
        n: accepted for call compatibility; not used by this function.
        label: accepted for call compatibility; not used by this function.

    Returns:
        A list with the full text of each file, ordered by fold number and
        then by file name.

    Raises:
        OSError: if a fold directory is missing or a file cannot be read.
    """
    res = []
    for i in range(a, b):
        folder = os.path.join(data, 'fold' + str(i))
        # os.listdir() returns entries in arbitrary order; sorting makes the
        # sample order (and therefore every downstream result) reproducible.
        for file in sorted(os.listdir(folder)):
            file_path = os.path.join(folder, file)
            # Skip nested directories (e.g. notebook checkpoint folders),
            # which open() cannot read.
            if not os.path.isfile(file_path):
                continue
            with open(file_path, 'r') as content_file:
                res.append(content_file.read())
    return res

def read_dir(data,n, label):
  """Load the folds under ``data`` as a (train, test) pair.

  Folds 1 to 4 form the first element and fold 5 the second; ``n`` and
  ``label`` are passed through to ``read_range`` unchanged.
  """
  first_four_folds = read_range(data, 1, 5, n, label)
  fifth_fold = read_range(data, 5, 6, n, label)
  return first_four_folds, fifth_fold

In [None]:
import pandas as pd
import numpy as np
import sklearn.metrics as metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import GridSearchCV
from nltk.classify import NaiveBayesClassifier
from sklearn.svm import SVC

scores = ['accuracy']
n = 1

# Load the two negative-polarity corpora, each split into (train, test).
train_neg_true, test_neg_true = read_dir(path_neg_true, n, 'positive')
train_neg_false, test_neg_false = read_dir(path_neg_false, n, 'negative')

# Column 0 holds the raw text; column 'y' is 1 for the truthful corpus
# and 0 for the deceptive one.
dfTrue = pd.DataFrame(train_neg_true).assign(y=1)
dfFalse = pd.DataFrame(train_neg_false).assign(y=0)

dfTestTrue = pd.DataFrame(test_neg_true).assign(y=1)
dfTestFalse = pd.DataFrame(test_neg_false).assign(y=0)

frames = [dfTrue, dfFalse]
test_frames = [dfTestTrue, dfTestFalse]

# Stack both classes into one training frame and one test frame.
df = pd.concat(frames)
test_df = pd.concat(test_frames)

# Classifier pipelines: Naive Bayes, logistic regression, decision tree and randomized-tree ensemble

In [None]:
# Every pipeline shares the same front end (token counts re-weighted by
# TF-IDF) and differs only in the final classifier step, named 'clf' so the
# parameter grids can address it as clf__<param>.
text_clfMNB = Pipeline([ ('vect', CountVectorizer()),
                      ('tfidf', TfidfTransformer()),
                      ('clf', MultinomialNB()),
])

text_clfLR = Pipeline([ ('vect', CountVectorizer()),
                      ('tfidf', TfidfTransformer()),
                      ('clf', LogisticRegression())
])

# The classifier step must be present: run_pipeline() calls predict() on the
# pipeline and parameters_DT tunes clf__max_depth.
text_clfCT = Pipeline([ ('vect', CountVectorizer()),
                      ('tfidf', TfidfTransformer()),
                      ('clf', tree.DecisionTreeClassifier())
])

# max_features='sqrt' is only meaningful for the tree ensemble.
# CountVectorizer accepts an int or None for max_features, and
# TfidfTransformer has no such argument at all.
text_clfRF = Pipeline([ ('vect', CountVectorizer()),
                      ('tfidf', TfidfTransformer()),
                      ('clf', ExtraTreesClassifier(max_features = 'sqrt'))
])

In [None]:
from sklearn.metrics import confusion_matrix

def run_pipeline(text_clf,parameters):
    """Grid-search ``text_clf`` on the training frame and report test metrics.

    The search runs 5-fold cross-validation over ``parameters`` using the
    module-level ``df`` (column 0 as text, column 'y' as label). The fitted
    search then predicts on ``test_df``; the classification report, the
    confusion matrix and the best parameter combination are printed.

    The pipeline passed in is no longer fitted separately beforehand: that
    earlier fit and its predictions were discarded immediately, so they only
    added training time. Use the returned estimator for any further work.

    Args:
        text_clf: the pipeline (or other estimator) to tune.
        parameters: parameter grid handed to ``GridSearchCV``.

    Returns:
        The ``best_estimator_`` of the fitted grid search.
    """
    gs_clf = GridSearchCV(text_clf, parameters, cv=5)
    gs_clf = gs_clf.fit(df[0],df['y'])
    predicted = gs_clf.predict(test_df[0])
    print(metrics.classification_report(test_df['y'], predicted))
    print(confusion_matrix(test_df['y'], predicted))
    print(gs_clf.best_params_)
    return gs_clf.best_estimator_

In [None]:
# Toggle between bigram-only (True) and unigram-only (False) features.
bi = True

arr = (2,2) if bi else (1,1)


def _param_grid(**clf_options):
    """Return a fresh grid: the shared vectorizer/TF-IDF options plus ``clf_options``."""
    grid = {
        'vect__ngram_range': [arr],
        'tfidf__use_idf': (True, False),
    }
    grid.update(clf_options)
    return grid


# One grid per classifier; only the clf__* entry differs between them.
parameters_MNB = _param_grid(clf__alpha=(1e-2, 1e-3))

parameters_DT = _param_grid(clf__max_depth=[999999])

parameters_LR = _param_grid(clf__C=[1,0.1,0.001,0.0001])

parameters_RF = _param_grid(clf__n_estimators=[100, 200, 300])

In [None]:
# Tune and evaluate the decision-tree pipeline; keep the best fitted estimator.
print('--------Dec. Trees------')
best_tree = run_pipeline(text_clf=text_clfCT, parameters=parameters_DT)

In [None]:
print('-------Random forests------')
import matplotlib.pyplot as plt
import numpy as np
best_forest = run_pipeline(text_clfRF,parameters_RF)
# The classifier is the last step of the fitted pipeline.
forest = best_forest.steps[-1][1]

importante = forest.feature_importances_
# Importance values above the threshold, in feature order.
importances = [x for x in importante if x > 0.001]
# Positions of those same features in the full importance vector.
kept = np.flatnonzero(importante > 0.001)
# Rank the kept features from most to least important. The ranking is given
# as indices into the full feature vector, so each entry can be looked up in
# the vectorizer's vocabulary. Previously these were positions inside the
# filtered list, which do not identify a feature.
indices = kept[np.argsort(importante[kept])[::-1]]
# Spread of each feature's importance across the individual trees.
std = np.std([est.feature_importances_ for est in forest.estimators_],
             axis=0)
print(indices)
print(importances)


In [None]:
# Tune and evaluate the logistic-regression pipeline.
print('--------Logistic regression-------')
run_pipeline(text_clf=text_clfLR, parameters=parameters_LR)

In [None]:
# Tune and evaluate the multinomial naive-Bayes pipeline.
print('--------Naive bayes-------')
run_pipeline(text_clf=text_clfMNB, parameters=parameters_MNB)

In [None]:
import matplotlib.pyplot as plt

def bar(popularity_data,plot_name):
  """Show a bar chart of (label, score) pairs, highest score first.

  Args:
    popularity_data: iterable of (label, score) pairs. It is not modified;
      a sorted copy is plotted.
    plot_name: text used as the y-axis label.

  Raises:
    ValueError: if ``popularity_data`` is empty.
  """
  # sorted() leaves the caller's sequence untouched and accepts any iterable,
  # not just lists.
  ranked = sorted(popularity_data, key=lambda x: x[1], reverse=True)
  if not ranked:
    # Fail before a figure is created, so no empty figure is left open.
    raise ValueError('popularity_data must contain at least one (label, score) pair')

  plt.figure(figsize=(15,12))

  # split the pairs into parallel label / score sequences
  data,score = zip(*ranked)

  # place bars at 0, 2, 4, ... so there is a one-unit gap between them
  x_pos = 2 * np.arange(len(data))

  plt.bar(x_pos, score,align='center')
  plt.xticks(x_pos, data)

  plt.ylabel(plot_name)

  plt.show()

print('-----PLOTING----------')
# Bar labels, in the same order as the score lists below.
t = [
    '|Naive Bayes|',
    '|Logistic regression|',
    '|Class. Tree|',
    '|Rand. Forest|',
]

# Hard-coded scores, one chart per list -- presumably copied from earlier
# runs of the cells above; TODO confirm they are still current.
for data, y_label in (
    ([0.89,0.86,0.63,0.82], 'Average precision unigrams'),
    ([0.79,0.62,0.84,0.81], 'Average precision bigrams'),
):
    bar(list(zip(t,data)), y_label)
