### Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (15, 7)
import pandas as pd

### Importing the dataset

In [2]:
# Load the CSV with pandas' default quote handling (quoting=3 is NOT passed here).
# header=None makes pandas treat the first line as data and label the columns
# with integers (0-5 in the preview below).
dataset = pd.read_csv('TextData_One.csv', header=None)
dataset.head()

Unnamed: 0,0,1,2,3,4,5
0,4,3,Mon May 11 03:17:40 UTC 2009,kindle2,tpryan,@stellargirl I loooooooovvvvvveee my Kindle2. ...
1,4,4,Mon May 11 03:18:03 UTC 2009,kindle2,vcu451,Reading my kindle2... Love it... Lee childs i...
2,4,5,Mon May 11 03:18:54 UTC 2009,kindle2,chadfu,"Ok, first assesment of the #kindle2 ...it fuck..."
3,4,6,Mon May 11 03:19:04 UTC 2009,kindle2,SIX15,@kenburbary You'll love your Kindle2. I've had...
4,4,7,Mon May 11 03:21:41 UTC 2009,kindle2,yamarama,@mikefish Fair enough. But i have the Kindle2...


In [3]:
# summary statistics; by default only the numeric columns (0 and 1) are summarised
dataset.describe()

Unnamed: 0,0,1
count,498.0,498.0
mean,2.02008,1867.226908
std,1.699686,2834.891681
min,0.0,3.0
25%,0.0,388.25
50%,2.0,1013.5
75%,4.0,2366.75
max,4.0,14076.0


In [4]:
# column dtypes and non-null counts, to check for missing values
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 498 entries, 0 to 497
Data columns (total 6 columns):
0    498 non-null int64
1    498 non-null int64
2    498 non-null object
3    498 non-null object
4    498 non-null object
5    498 non-null object
dtypes: int64(2), object(4)
memory usage: 23.4+ KB


### Cleaning the texts, just first text 
Create a bag of words: the tokenisation process splits each review into its relevant words, then assigns each word to a column holding the number of times that word appears  
  
keep only relevant words, i.e. non-useful words such as 'the', 'on', 'is', '...', '1', etc. are removed  
apply stemming: take the root of each word, e.g. loved -> love, so as not to have too many words  
get rid of capitals  

In [5]:
# text of the first row (row label 0, column label 5)
dataset.loc[0, 5]

'@stellargirl I loooooooovvvvvveee my Kindle2. Not that the DX is cool, but the 2 is fantastic in its own right.'

In [6]:
# replace every character that is not an ASCII letter (a-z or A-Z) with a space
import re
review = re.compile('[^a-zA-Z]').sub(' ', dataset[5][0])
review

' stellargirl I loooooooovvvvvveee my Kindle   Not that the DX is cool  but the   is fantastic in its own right '

In [7]:
# lowercase every letter so that e.g. 'Kindle' and 'kindle' become the same word
review = review.lower()
review

' stellargirl i loooooooovvvvvveee my kindle   not that the dx is cool  but the   is fantastic in its own right '

In [8]:
# convert the string into a list of words; split() with no argument splits on
# runs of whitespace, so the repeated spaces do not produce empty strings
review = review.split()
review

['stellargirl',
 'i',
 'loooooooovvvvvveee',
 'my',
 'kindle',
 'not',
 'that',
 'the',
 'dx',
 'is',
 'cool',
 'but',
 'the',
 'is',
 'fantastic',
 'in',
 'its',
 'own',
 'right']

In [9]:
# get rid of irrelevant words such as 'the', 'on', 'is', etc.
import nltk

# the list of irrelevant words ('stopwords') ships as an NLTK corpus; if it is not
# installed yet, download it once by running the commented-out line below
#nltk.download('stopwords')
# then import 'stopwords' from 'nltk' to use it
from nltk.corpus import stopwords

In [10]:
# apply stemming: take the root of a word, e.g. loved -> love, so that variants of
# the same word share one feature and the word-count matrix is less sparse
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [11]:
# Drop the stop words and stem every remaining word.
# The stop-word list is converted to a set ONCE, outside the comprehension:
# set membership tests are fast, but only if the set is not rebuilt for each word.
english_stopwords = set(stopwords.words('english'))
review = [ps.stem(word) for word in review if word not in english_stopwords]
review

['stellargirl', 'loooooooovvvvvvee', 'kindl', 'dx', 'cool', 'fantast', 'right']

In [12]:
# convert the list of words back into a single string separated by spaces
review = ' '.join(review)
review

'stellargirl loooooooovvvvvvee kindl dx cool fantast right'

### Cleaning the texts
Perform the same text-cleaning process on every text in the dataset

In [13]:
# corpus is a collection of texts of the same type; it is built by applying the
# cleaning steps demonstrated above to every text in column 5
english_stopwords = set(stopwords.words('english'))  # built once, not once per word
ps = PorterStemmer()  # one stemmer instance is reused for every text
corpus = []
for text in dataset[5]:  # iterate over the column itself instead of a hard-coded row count
    review = re.sub('[^a-zA-Z]', ' ', text)  # non-letters -> spaces
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if word not in english_stopwords]
    review = ' '.join(review)
    corpus.append(review)

In [14]:
# peek at the last five cleaned texts
pd.DataFrame(corpus).tail()

Unnamed: 0,0
493,ask program latex indesign submit calcio link ...
494,note hate word hate page hate latex said hate ...
495,ahhh back real text edit environ lt latex
496,troubl iran see hmm iran iran far away flockof...
497,read tweet come iran whole thing terrifi incre...


### Creating Bag of Words Model
Take all the unique words and create one column per word, while limiting the number of words kept

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 1200) # cap the vocabulary at 1200 terms
# fit_transform returns a sparse matrix; .toarray() turns it into a dense array
# NOTE(review): the vocabulary is fitted on the whole corpus, i.e. before the
# train/test split below, so the test rows influence the feature set
X = cv.fit_transform(corpus).toarray()
# target vector: column 0 of the dataset (classes 0, 2 and 4 in the reports below)
y = dataset.iloc[:, 0].values

In [16]:
# peek at the first five target labels
pd.DataFrame(y).head()

Unnamed: 0,0
0,4
1,4
2,4
3,4
4,4


### Splitting the dataset into the Training set and Test set

In [17]:
# train_test_split lives in sklearn.model_selection; the old sklearn.cross_validation
# module was deprecated in scikit-learn 0.18 and removed in 0.20
from sklearn.model_selection import train_test_split
# hold out 20% of the rows for testing; random_state fixes the shuffle so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)



# Fitting Naive Bayes to the dataset
Naive Bayes classifies by calculating the probability of each category given the features (independent variables, IVs)

In [18]:
from sklearn.naive_bayes import GaussianNB
# fit() returns the estimator itself, so construction and training are chained
classifier = GaussianNB().fit(X_train, y_train)
classifier

GaussianNB(priors=None)

### Predicting a new result with Naive Bayes
All models are wrong but some are useful...

In [19]:
# predict on both splits; the training-set predictions are used in the train confusion matrix below
y_pred_train, y_pred_test = classifier.predict(X_train), classifier.predict(X_test)
y_pred_test

array([0, 2, 2, 0, 4, 4, 0, 4, 2, 0, 0, 0, 2, 4, 2, 4, 4, 4, 0, 0, 4, 4, 4,
       4, 0, 2, 4, 2, 0, 2, 2, 4, 4, 0, 4, 4, 2, 4, 4, 4, 4, 2, 2, 0, 2, 2,
       0, 4, 2, 0, 2, 0, 0, 0, 4, 4, 4, 2, 2, 4, 4, 4, 0, 4, 0, 4, 4, 0, 0,
       0, 0, 2, 0, 2, 0, 0, 2, 4, 4, 4, 2, 4, 4, 2, 0, 2, 2, 4, 4, 0, 4, 0,
       2, 2, 4, 0, 4, 4, 4, 2])

In [20]:
# true test-set labels, shown for comparison with the predictions above
y_test

array([0, 4, 2, 4, 0, 4, 4, 4, 2, 2, 2, 0, 2, 4, 2, 4, 2, 4, 0, 0, 4, 2, 0,
       4, 2, 2, 4, 2, 0, 0, 2, 4, 4, 4, 4, 4, 2, 2, 2, 4, 4, 2, 2, 0, 2, 4,
       0, 4, 2, 0, 0, 0, 0, 0, 4, 4, 4, 2, 0, 4, 0, 4, 0, 4, 0, 4, 2, 0, 0,
       2, 0, 0, 2, 0, 2, 2, 4, 0, 4, 4, 2, 2, 2, 2, 0, 2, 4, 2, 4, 0, 2, 2,
       0, 2, 4, 0, 4, 2, 4, 4])

### Confusion Matrix

In [21]:
from sklearn import metrics
# training-set evaluation: the confusion matrix is computed once and reused for printing
cm = metrics.confusion_matrix(y_train, y_pred_train)
print("Confusion Matrix")
print(cm)
print("")

print("Classification Report")
print(metrics.classification_report(y_train, y_pred_train))

Confusion Matrix
[[138   2   8]
 [  0 104   0]
 [  0   3 143]]

Classification Report
             precision    recall  f1-score   support

          0       1.00      0.93      0.97       148
          2       0.95      1.00      0.98       104
          4       0.95      0.98      0.96       146

avg / total       0.97      0.97      0.97       398



In [22]:
from sklearn import metrics
# test-set evaluation: cm is built from the TEST labels and predictions so that the
# stored matrix matches what this cell prints
cm = metrics.confusion_matrix(y_test, y_pred_test)
print("Confusion Matrix")
print(cm)
print("")

print("Classification Report")
print(metrics.classification_report(y_test, y_pred_test))

Confusion Matrix
[[19  6  4]
 [ 8 17 10]
 [ 3  5 28]]

Classification Report
             precision    recall  f1-score   support

          0       0.63      0.66      0.64        29
          2       0.61      0.49      0.54        35
          4       0.67      0.78      0.72        36

avg / total       0.64      0.64      0.63       100



# Fitting Decision Tree to the dataset

In [23]:
from sklearn.tree import DecisionTreeClassifier
# entropy-based splits; random_state makes the fitted tree reproducible
classifier = DecisionTreeClassifier(
    criterion='entropy',
    random_state=0,
).fit(X_train, y_train)
classifier

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')

### Predicting a new result with Decision Tree
All models are wrong but some are useful...

In [24]:
# predictions for the training split and the held-out split, in that order
y_pred_train, y_pred_test = (classifier.predict(features) for features in (X_train, X_test))
y_pred_test

array([0, 4, 2, 4, 4, 4, 4, 4, 2, 0, 4, 2, 2, 4, 2, 4, 2, 4, 0, 0, 0, 2, 2,
       4, 0, 2, 4, 4, 0, 2, 2, 4, 4, 4, 4, 4, 2, 2, 4, 4, 4, 0, 2, 4, 2, 4,
       0, 4, 2, 0, 2, 0, 0, 0, 2, 0, 0, 4, 0, 4, 4, 0, 0, 0, 4, 4, 2, 0, 0,
       4, 0, 4, 0, 0, 2, 4, 2, 0, 4, 4, 2, 4, 2, 2, 0, 2, 2, 2, 4, 0, 0, 2,
       2, 2, 2, 0, 2, 4, 2, 4])

In [25]:
# true test-set labels, shown for comparison with the predictions above
y_test

array([0, 4, 2, 4, 0, 4, 4, 4, 2, 2, 2, 0, 2, 4, 2, 4, 2, 4, 0, 0, 4, 2, 0,
       4, 2, 2, 4, 2, 0, 0, 2, 4, 4, 4, 4, 4, 2, 2, 2, 4, 4, 2, 2, 0, 2, 4,
       0, 4, 2, 0, 0, 0, 0, 0, 4, 4, 4, 2, 0, 4, 0, 4, 0, 4, 0, 4, 2, 0, 0,
       2, 0, 0, 2, 0, 2, 2, 4, 0, 4, 4, 2, 2, 2, 2, 0, 2, 4, 2, 4, 0, 2, 2,
       0, 2, 4, 0, 4, 2, 4, 4])

### Confusion Matrix

In [26]:
from sklearn import metrics
# training-set evaluation: the confusion matrix is computed once and reused for printing
cm = metrics.confusion_matrix(y_train, y_pred_train)
print("Confusion Matrix")
print(cm)
print("")

print("Classification Report")
print(metrics.classification_report(y_train, y_pred_train))

Confusion Matrix
[[148   0   0]
 [  0 104   0]
 [  0   0 146]]

Classification Report
             precision    recall  f1-score   support

          0       1.00      1.00      1.00       148
          2       1.00      1.00      1.00       104
          4       1.00      1.00      1.00       146

avg / total       1.00      1.00      1.00       398



In [27]:
from sklearn import metrics
# test-set evaluation: cm is built from the TEST labels and predictions so that the
# stored matrix matches what this cell prints
cm = metrics.confusion_matrix(y_test, y_pred_test)
print("Confusion Matrix")
print(cm)
print("")

print("Classification Report")
print(metrics.classification_report(y_test, y_pred_test))

Confusion Matrix
[[19  5  5]
 [ 5 22  8]
 [ 5  6 25]]

Classification Report
             precision    recall  f1-score   support

          0       0.66      0.66      0.66        29
          2       0.67      0.63      0.65        35
          4       0.66      0.69      0.68        36

avg / total       0.66      0.66      0.66       100



# Fitting Random Forest to the dataset
All models are wrong but some are useful...

In [28]:
from sklearn.ensemble import RandomForestClassifier
# a forest of 15 entropy-split trees; random_state makes the fit reproducible
classifier = RandomForestClassifier(
    n_estimators=15,
    criterion='entropy',
    random_state=0,
).fit(X_train, y_train)
classifier

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=15, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

### Predicting a new result with Random Forest
All models are wrong but some are useful...

In [29]:
# predict on both splits; the training-set predictions are used in the train confusion matrix below
y_pred_train, y_pred_test = classifier.predict(X_train), classifier.predict(X_test)
y_pred_test

array([0, 4, 2, 4, 4, 0, 4, 2, 2, 0, 4, 2, 2, 4, 2, 4, 2, 4, 0, 0, 0, 4, 2,
       4, 0, 2, 4, 4, 4, 2, 0, 4, 4, 4, 4, 4, 2, 2, 4, 4, 4, 0, 2, 4, 2, 4,
       0, 4, 2, 0, 4, 0, 0, 0, 4, 4, 4, 4, 0, 4, 4, 4, 0, 4, 4, 0, 2, 0, 0,
       2, 0, 4, 0, 0, 2, 4, 0, 2, 4, 4, 2, 4, 2, 2, 0, 2, 2, 2, 2, 0, 4, 2,
       2, 2, 2, 0, 2, 4, 4, 4])

In [30]:
# true test-set labels, shown for comparison with the predictions above
y_test

array([0, 4, 2, 4, 0, 4, 4, 4, 2, 2, 2, 0, 2, 4, 2, 4, 2, 4, 0, 0, 4, 2, 0,
       4, 2, 2, 4, 2, 0, 0, 2, 4, 4, 4, 4, 4, 2, 2, 2, 4, 4, 2, 2, 0, 2, 4,
       0, 4, 2, 0, 0, 0, 0, 0, 4, 4, 4, 2, 0, 4, 0, 4, 0, 4, 0, 4, 2, 0, 0,
       2, 0, 0, 2, 0, 2, 2, 4, 0, 4, 4, 2, 2, 2, 2, 0, 2, 4, 2, 4, 0, 2, 2,
       0, 2, 4, 0, 4, 2, 4, 4])

### Confusion Matrix

In [31]:
from sklearn import metrics
# training-set evaluation: the confusion matrix is computed once and reused for printing
cm = metrics.confusion_matrix(y_train, y_pred_train)
print("Confusion Matrix")
print(cm)
print("")

print("Classification Report")
print(metrics.classification_report(y_train, y_pred_train))

Confusion Matrix
[[147   0   1]
 [  2 102   0]
 [  0   0 146]]

Classification Report
             precision    recall  f1-score   support

          0       0.99      0.99      0.99       148
          2       1.00      0.98      0.99       104
          4       0.99      1.00      1.00       146

avg / total       0.99      0.99      0.99       398



In [32]:
from sklearn import metrics
# test-set evaluation: cm is built from the TEST labels and predictions so that the
# stored matrix matches what this cell prints
cm = metrics.confusion_matrix(y_test, y_pred_test)
print("Confusion Matrix")
print(cm)
print("")

print("Classification Report")
print(metrics.classification_report(y_test, y_pred_test))

Confusion Matrix
[[17  5  7]
 [ 5 21  9]
 [ 4  5 27]]

Classification Report
             precision    recall  f1-score   support

          0       0.65      0.59      0.62        29
          2       0.68      0.60      0.64        35
          4       0.63      0.75      0.68        36

avg / total       0.65      0.65      0.65       100



# Fitting K-NN to the dataset
All models are wrong but some are useful...

In [33]:
from sklearn.neighbors import KNeighborsClassifier
# 5 nearest neighbours; the Minkowski metric with p=2 is the Euclidean distance
classifier = KNeighborsClassifier(
    n_neighbors=5,
    p=2,
    metric='minkowski',
).fit(X_train, y_train)
classifier

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

### Predicting a new result with K-NN

In [34]:
# predictions for the training split and the held-out split, in that order
y_pred_train, y_pred_test = (classifier.predict(features) for features in (X_train, X_test))
y_pred_test

array([0, 4, 2, 4, 0, 4, 0, 2, 0, 0, 0, 4, 2, 0, 0, 0, 0, 4, 0, 0, 0, 2, 0,
       0, 2, 0, 4, 4, 0, 0, 0, 2, 4, 4, 2, 4, 2, 0, 0, 4, 2, 2, 2, 0, 4, 4,
       0, 4, 2, 0, 4, 0, 0, 0, 4, 0, 2, 0, 0, 0, 0, 2, 0, 0, 2, 2, 0, 0, 0,
       0, 0, 0, 0, 0, 2, 2, 0, 0, 2, 0, 2, 0, 2, 0, 0, 0, 2, 2, 0, 4, 0, 0,
       0, 2, 2, 0, 2, 0, 2, 0])

In [35]:
# true test-set labels, shown for comparison with the predictions above
y_test

array([0, 4, 2, 4, 0, 4, 4, 4, 2, 2, 2, 0, 2, 4, 2, 4, 2, 4, 0, 0, 4, 2, 0,
       4, 2, 2, 4, 2, 0, 0, 2, 4, 4, 4, 4, 4, 2, 2, 2, 4, 4, 2, 2, 0, 2, 4,
       0, 4, 2, 0, 0, 0, 0, 0, 4, 4, 4, 2, 0, 4, 0, 4, 0, 4, 0, 4, 2, 0, 0,
       2, 0, 0, 2, 0, 2, 2, 4, 0, 4, 4, 2, 2, 2, 2, 0, 2, 4, 2, 4, 0, 2, 2,
       0, 2, 4, 0, 4, 2, 4, 4])

### Confusion Matrix
Checking for overfitting by comparing the training-set and test-set confusion matrices

In [36]:
from sklearn import metrics
# training-set evaluation: the confusion matrix is computed once and reused for printing
cm = metrics.confusion_matrix(y_train, y_pred_train)
print("Confusion Matrix")
print(cm)
print("")

print("Classification Report")
print(metrics.classification_report(y_train, y_pred_train))

Confusion Matrix
[[128  12   8]
 [ 24  69  11]
 [ 49  18  79]]

Classification Report
             precision    recall  f1-score   support

          0       0.64      0.86      0.73       148
          2       0.70      0.66      0.68       104
          4       0.81      0.54      0.65       146

avg / total       0.71      0.69      0.69       398



In [37]:
from sklearn import metrics
# test-set evaluation: cm is built from the TEST labels and predictions so that the
# stored matrix matches what this cell prints
cm = metrics.confusion_matrix(y_test, y_pred_test)
print("Confusion Matrix")
print(cm)
print("")

print("Classification Report")
print(metrics.classification_report(y_test, y_pred_test))

Confusion Matrix
[[25  1  3]
 [19 14  2]
 [12 12 12]]

Classification Report
             precision    recall  f1-score   support

          0       0.45      0.86      0.59        29
          2       0.52      0.40      0.45        35
          4       0.71      0.33      0.45        36

avg / total       0.57      0.51      0.49       100



### Fitting Support Vector Machine (SVM) to the dataset

In [38]:
from sklearn.svm import SVC
# support vector classifier with a linear kernel
classifier = SVC(kernel='linear', random_state=0).fit(X_train, y_train)
classifier

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=0, shrinking=True,
  tol=0.001, verbose=False)

### Predicting a new result with Support Vector Machine (SVM)

In [39]:
# predict on both splits; the training-set predictions are used in the train confusion matrix below
y_pred_train, y_pred_test = classifier.predict(X_train), classifier.predict(X_test)
y_pred_test

array([0, 4, 2, 4, 4, 4, 0, 2, 0, 0, 4, 4, 2, 0, 2, 4, 4, 4, 0, 0, 0, 4, 2,
       0, 0, 2, 4, 4, 0, 2, 2, 4, 4, 4, 4, 4, 2, 2, 4, 4, 4, 2, 2, 2, 2, 4,
       0, 4, 2, 0, 2, 0, 0, 0, 4, 4, 4, 4, 0, 4, 4, 0, 0, 4, 0, 0, 2, 0, 0,
       2, 0, 4, 0, 0, 2, 4, 0, 0, 4, 4, 2, 0, 2, 2, 0, 2, 2, 4, 2, 4, 4, 2,
       2, 2, 4, 0, 4, 4, 2, 4])

In [40]:
# true test-set labels, shown for comparison with the predictions above
y_test

array([0, 4, 2, 4, 0, 4, 4, 4, 2, 2, 2, 0, 2, 4, 2, 4, 2, 4, 0, 0, 4, 2, 0,
       4, 2, 2, 4, 2, 0, 0, 2, 4, 4, 4, 4, 4, 2, 2, 2, 4, 4, 2, 2, 0, 2, 4,
       0, 4, 2, 0, 0, 0, 0, 0, 4, 4, 4, 2, 0, 4, 0, 4, 0, 4, 0, 4, 2, 0, 0,
       2, 0, 0, 2, 0, 2, 2, 4, 0, 4, 4, 2, 2, 2, 2, 0, 2, 4, 2, 4, 0, 2, 2,
       0, 2, 4, 0, 4, 2, 4, 4])

### Confusion Matrix
Checking for overfitting by comparing the training-set and test-set confusion matrices

In [41]:
from sklearn import metrics
# training-set evaluation: the confusion matrix is computed once and reused for printing
cm = metrics.confusion_matrix(y_train, y_pred_train)
print("Confusion Matrix")
print(cm)
print("")

print("Classification Report")
print(metrics.classification_report(y_train, y_pred_train))

Confusion Matrix
[[147   0   1]
 [  0 104   0]
 [  0   1 145]]

Classification Report
             precision    recall  f1-score   support

          0       1.00      0.99      1.00       148
          2       0.99      1.00      1.00       104
          4       0.99      0.99      0.99       146

avg / total       0.99      0.99      0.99       398



In [42]:
# imported here as well so the cell does not depend on an earlier evaluation cell having run
from sklearn import metrics
# test-set evaluation: the confusion matrix is computed once and reused for printing
cm = metrics.confusion_matrix(y_test, y_pred_test)
print("Confusion Matrix")
print(cm)
print("")

print("Classification Report")
print(metrics.classification_report(y_test, y_pred_test))

Confusion Matrix
[[19  5  5]
 [ 5 20 10]
 [ 7  4 25]]

Classification Report
             precision    recall  f1-score   support

          0       0.61      0.66      0.63        29
          2       0.69      0.57      0.62        35
          4       0.62      0.69      0.66        36

avg / total       0.64      0.64      0.64       100

