In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

import plotly.offline as py
import plotly.graph_objs as go
py.init_notebook_mode(connected=True)

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

import string
import re

[nltk_data] Downloading package stopwords to /content/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
!pip install -U -q PyDrive

In [3]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

In [4]:
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [5]:
file_list = drive.ListFile({'q': "'1DKJEpwYEPfGzEOzZZ1TwVM_pZIc4RnbF' in parents and trashed=false"}).GetList()
for file1 in file_list:
  print('title: %s, id: %s' % (file1['title'], file1['id']))

title: train.csv, id: 1piDuPiqjKJzF2W1g134KlnKFCaGJNId6


In [6]:
train_downloaded = drive.CreateFile({'id': '1piDuPiqjKJzF2W1g134KlnKFCaGJNId6'})
train_downloaded.GetContentFile('train.csv')

In [7]:
import pandas as pd

data = pd.read_csv('train.csv')

In [8]:
data.head(5)

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20800 entries, 0 to 20799
Data columns (total 5 columns):
id        20800 non-null int64
title     20242 non-null object
author    18843 non-null object
text      20761 non-null object
label     20800 non-null int64
dtypes: int64(2), object(3)
memory usage: 812.6+ KB


http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html
http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html


In [10]:
def clean_text(text):
    
    ## Remove puncuation
    text = text.translate(string.punctuation)
    
    ## Convert words to lower case and split them
    text = text.lower().split()
    
    ## Remove stop words
    stops = set(stopwords.words("english"))
    text = [w for w in text if not w in stops and len(w) >= 3]
    
    text = " ".join(text)
    ## Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"[^a-zA-z\s]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    ##remove less than 4 letters
    text = re.sub(r'\b\w{1,3}\b', '', text)
    ## Stemming
    text = text.split()
    stemmer = SnowballStemmer('english')
    stemmed_words = [stemmer.stem(word) for word in text]
    text = " ".join(stemmed_words)
    
    return text


In [11]:
#data prep
data=data.fillna(' ')
data['all'] = data['title']+' '+ data['author'] +' '+ data['text']

## apply the above function to data['all']
data['all'] = data['all'].map(lambda x: clean_text(x))

In [12]:
targets = data['label'].values

In [13]:
#tfidf

transformer = TfidfTransformer(smooth_idf=False)
count_vectorizer = CountVectorizer(ngram_range=(1, 3))
counts = count_vectorizer.fit_transform(data['all'].values)
features = transformer.fit_transform(counts)

In [14]:
features.shape[1]

10429999

In [15]:
# fix random seed for reproducibility
seed = 7
np.random.seed(seed)

#Cross Validation
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)

In [16]:
#X and Y

X = features
Y = targets

In [17]:
#Naive Bayes

NB = MultinomialNB()
cvscores = []
for train, test in kfold.split(X, Y):
  NB.fit(X[train], Y[train])
  scores = NB.score(X[test], Y[test])
  cvscores.append(scores * 100)

print("ACC %.2f%% (std +/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))

ACC 84.92% (std +/- 0.77%)


In [18]:
#AdaBoost

Adab= AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),n_estimators=5)
cvscores = []
for train, test in kfold.split(X, Y):
  Adab.fit(X[train], Y[train])
  scores = NB.score(X[test], Y[test])
  cvscores.append(scores * 100)

print("ACC %.2f%% (std +/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))

ACC 95.70% (std +/- 4.03%)


In [0]:
#top 20000 features 
count_vectorizer = CountVectorizer(ngram_range=(1, 3), max_features=20000)
count_vectorizer.fit(data['all'].values)
doc_array = count_vectorizer.transform(data['all'].values).toarray()
frequency_matrix = pd.DataFrame(doc_array, columns=count_vectorizer.get_feature_names())

In [0]:
#X and Y

X = frequency_matrix.values
Y = targets

In [0]:
#Naive Bayes

NB = MultinomialNB()
cvscores = []
for train, test in kfold.split(X, Y):
  NB.fit(X[train], Y[train])
  scores = NB.score(X[test], Y[test])
  cvscores.append(scores * 100)

print("ACC %.2f%% (std +/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))

ACC 91.72% (std +/- 0.45%)


In [0]:
#AdaBoost

Adab= AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),n_estimators=5)
cvscores = []
for train, test in kfold.split(X, Y):
  Adab.fit(X[train], Y[train])
  scores = NB.score(X[test], Y[test])
  cvscores.append(scores * 100)

print("ACC %.2f%% (std +/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))

ACC 92.72% (std +/- 0.59%)


In [0]:
#top 18000 features 
count_vectorizer = CountVectorizer(ngram_range=(1, 3), max_features=18000)
count_vectorizer.fit(data['all'].values)
doc_array = count_vectorizer.transform(data['all'].values).toarray()
frequency_matrix = pd.DataFrame(doc_array, columns=count_vectorizer.get_feature_names())

In [0]:
#X and Y

X = frequency_matrix.values
Y = targets

In [16]:
#Naive Bayes

NB = MultinomialNB()
cvscores = []
for train, test in kfold.split(X, Y):
  NB.fit(X[train], Y[train])
  scores = NB.score(X[test], Y[test])
  cvscores.append(scores * 100)

print("ACC %.2f%% (std +/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))

ACC 91.63% (std +/- 0.48%)


In [17]:
#AdaBoost

Adab= AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),n_estimators=5)
cvscores = []
for train, test in kfold.split(X, Y):
  Adab.fit(X[train], Y[train])
  scores = NB.score(X[test], Y[test])
  cvscores.append(scores * 100)

print("ACC %.2f%% (std +/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))

ACC 92.50% (std +/- 0.61%)


In [0]:
#top 15000 features 
count_vectorizer = CountVectorizer(ngram_range=(1, 3), max_features=15000)
count_vectorizer.fit(data['all'].values)
doc_array = count_vectorizer.transform(data['all'].values).toarray()
frequency_matrix = pd.DataFrame(doc_array, columns=count_vectorizer.get_feature_names())

In [0]:
#X and Y

X = frequency_matrix.values
Y = targets

In [16]:
#Naive Bayes

NB = MultinomialNB()
cvscores = []
for train, test in kfold.split(X, Y):
  NB.fit(X[train], Y[train])
  scores = NB.score(X[test], Y[test])
  cvscores.append(scores * 100)

print("ACC %.2f%% (std +/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))

ACC 91.26% (std +/- 0.44%)


In [17]:
#AdaBoost

Adab= AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),n_estimators=5)
cvscores = []
for train, test in kfold.split(X, Y):
  Adab.fit(X[train], Y[train])
  scores = NB.score(X[test], Y[test])
  cvscores.append(scores * 100)

print("ACC %.2f%% (std +/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))

ACC 92.06% (std +/- 0.46%)


In [0]:
#top 10000 features 
count_vectorizer = CountVectorizer(ngram_range=(1, 3), max_features=10000)
count_vectorizer.fit(data['all'].values)
doc_array = count_vectorizer.transform(data['all'].values).toarray()
frequency_matrix = pd.DataFrame(doc_array, columns=count_vectorizer.get_feature_names())

In [0]:
#X and Y

X = frequency_matrix.values
Y = targets

In [16]:
#Naive Bayes

NB = MultinomialNB()
cvscores = []
for train, test in kfold.split(X, Y):
  NB.fit(X[train], Y[train])
  scores = NB.score(X[test], Y[test])
  cvscores.append(scores * 100)

print("ACC %.2f%% (std +/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))

ACC 90.13% (std +/- 0.65%)


In [17]:
#AdaBoost

Adab= AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),n_estimators=5)
cvscores = []
for train, test in kfold.split(X, Y):
  Adab.fit(X[train], Y[train])
  scores = NB.score(X[test], Y[test])
  cvscores.append(scores * 100)

print("ACC %.2f%% (std +/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))

ACC 90.83% (std +/- 0.58%)


In [0]:
#top 8000 features 
count_vectorizer = CountVectorizer(ngram_range=(1, 3), max_features=8000)
count_vectorizer.fit(data['all'].values)
doc_array = count_vectorizer.transform(data['all'].values).toarray()
frequency_matrix = pd.DataFrame(doc_array, columns=count_vectorizer.get_feature_names())

In [0]:
#X and Y

X = frequency_matrix.values
Y = targets

In [20]:
#Naive Bayes

NB = MultinomialNB()
cvscores = []
for train, test in kfold.split(X, Y):
  NB.fit(X[train], Y[train])
  scores = NB.score(X[test], Y[test])
  cvscores.append(scores * 100)

print("ACC %.2f%% (std +/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))

ACC 89.74% (std +/- 0.63%)


In [21]:
#AdaBoost

Adab= AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),n_estimators=5)
cvscores = []
for train, test in kfold.split(X, Y):
  Adab.fit(X[train], Y[train])
  scores = NB.score(X[test], Y[test])
  cvscores.append(scores * 100)

print("ACC %.2f%% (std +/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))

ACC 90.31% (std +/- 0.59%)
