In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer 

In [3]:
data = pd.read_csv("data.tsv", delimiter='	', header=0)
data['Sentiment'].value_counts()

2    79582
3    32927
1    27273
4     9206
0     7072
Name: Sentiment, dtype: int64

In [4]:
data = pd.read_csv("data.tsv", delimiter='	', header=0)
data = data[0:4990]
X = data[['PhraseId', 'SentenceId', 'Phrase']]
y = data['Sentiment']
stopWords = set(stopwords.words('english'))
ps = PorterStemmer()
data['Phrase'] = data['Phrase'].apply(lambda x: " ".join([item for item in (x.lower()).split(" ") if item not in stopWords]))
data['Phrase'] = data['Phrase'].apply(lambda x: ps.stem(x))
type(data)

pandas.core.frame.DataFrame

In [161]:
wordsbag = CountVectorizer()
X_wb = wordsbag.fit_transform(data['Phrase'])
print("Bag of Words:", X_wb.shape)
vectorizer = TfidfVectorizer()
X_tfIdf = vectorizer.fit_transform(data['Phrase'])
print("Words Vectorizer:", X_tfIdf.shape)

Bag of Words: (4990, 2071)
Words Vectorizer: (4990, 2071)


In [162]:
xTrain, xTest, yTrainLabels, yTestLabels = train_test_split(X_wb, y,test_size=0.2, random_state=123)
trainBW, testBW, trainLabelsBW, testLabelsBW = train_test_split(xTest, yTestLabels, test_size=0.3, random_state=123)

In [163]:
xTrain, xTest, yTrainLabels, yTestLabels = train_test_split(X_tfIdf, y,test_size=0.2, random_state=123)
trainTFIDF, testTFIDF, trainLabelsTFIDF, testLabelsTFIDF = train_test_split(xTest, yTestLabels, test_size=0.3, random_state=123)

In [164]:
clf = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial')
clf.fit(trainBW, trainLabelsBW)
predict = clf.predict(testBW)
print("Accuracy for wordsbag with Logistic Regression:")
print(accuracy_score(testLabelsBW, predict))

svc = SVC()
svc.fit(trainBW, trainLabelsBW)
predict = svc.predict(testBW)
print("Accuracy for wordsbag with SVC")
print(accuracy_score(testLabelsBW, predict))


Accuracy for wordsbag with Logistic Regression:
0.6766666666666666
Accuracy for wordsbag with SVC
0.6166666666666667


In [165]:
lg = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial')
lg.fit(trainTFIDF, trainLabelsTFIDF)
predict = lg.predict(testTFIDF)
print("Accuracy for vectorizer with Logistic Regression:")
print(accuracy_score(testLabelsTFIDF, predict))


svcc = SVC()
svcc.fit(trainTFIDF, trainLabelsTFIDF)
predict = svcc.predict(testTFIDF)
print("Accuracy for vectorizer with SVC")
print(accuracy_score(testLabelsTFIDF, predict))

Accuracy for vectorizer with Logistic Regression:
0.64
Accuracy for vectorizer with SVC
0.6166666666666667


In [7]:
from nltk.tokenize import sent_tokenize, word_tokenize
# nltk.download()

data = "All work and no play makes jack dull boy. All work and no play makes jack a dull boy."
stopWords = set(stopwords.words('english'))
words = word_tokenize(data)
wordsFiltered = []
 
for w in words:
    print(ps.stem(w)) #stemming
    if w not in stopWords:
        wordsFiltered.append(w) #stopwords
print(wordsFiltered)

all
work
and
no
play
make
jack
dull
boy
.
all
work
and
no
play
make
jack
a
dull
boy
.
['All', 'work', 'play', 'makes', 'jack', 'dull', 'boy', '.', 'All', 'work', 'play', 'makes', 'jack', 'dull', 'boy', '.']
