In [55]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

In [56]:
filepath_dict = {'yelp':   'assets/yelp_labelled.txt',
                 'amazon': 'assets/amazon_cells_labelled.txt',
                 'imdb':   'assets/imdb_labelled.txt'}

In [57]:
df_list = []

In [58]:
filepath_dict.items()

dict_items([('yelp', 'assets/yelp_labelled.txt'), ('amazon', 'assets/amazon_cells_labelled.txt'), ('imdb', 'assets/imdb_labelled.txt')])

In [59]:
for source, filepath in filepath_dict.items():
    df = pd.read_csv(filepath, names=['sentence', 'label'], sep='\t')
    df['source'] = source
    df_list.append(df)  

In [60]:
df_list[0].head()

Unnamed: 0,sentence,label,source
0,Wow... Loved this place.,1,yelp
1,Crust is not good.,0,yelp
2,Not tasty and the texture was just nasty.,0,yelp
3,Stopped by during the late May bank holiday of...,1,yelp
4,The selection on the menu was great and so wer...,1,yelp


In [61]:
df = pd.concat(df_list)
df.head()

Unnamed: 0,sentence,label,source
0,Wow... Loved this place.,1,yelp
1,Crust is not good.,0,yelp
2,Not tasty and the texture was just nasty.,0,yelp
3,Stopped by during the late May bank holiday of...,1,yelp
4,The selection on the menu was great and so wer...,1,yelp


In [62]:
df.tail()

Unnamed: 0,sentence,label,source
743,I just got bored watching Jessice Lange take h...,0,imdb
744,"Unfortunately, any virtue in this film's produ...",0,imdb
745,"In a word, it is embarrassing.",0,imdb
746,Exceptionally bad!,0,imdb
747,All in all its an insult to one's intelligence...,0,imdb


In [63]:
df_yelp = df[df['source'] == 'yelp']

In [64]:
x_yelp = df_yelp['sentence'].values
y_yelp = df_yelp['label'].values

In [65]:
x_train, x_test, y_train, y_test = train_test_split(x_yelp, y_yelp, test_size=0.25, random_state=1000)

In [66]:
 vectorizer = CountVectorizer()

In [67]:
vectorizer.fit(x_train)

CountVectorizer()

In [68]:
vectorizer.vocabulary_

{'the': 1494,
 'food': 597,
 'was': 1634,
 'barely': 125,
 'lukewarm': 888,
 'so': 1360,
 'it': 801,
 'must': 973,
 'have': 710,
 'been': 145,
 'sitting': 1345,
 'waiting': 1622,
 'for': 600,
 'server': 1308,
 'to': 1524,
 'bring': 201,
 'out': 1042,
 'us': 1587,
 'sorry': 1377,
 'will': 1674,
 'not': 1001,
 'be': 136,
 'getting': 638,
 'from': 616,
 'here': 725,
 'anytime': 63,
 'soon': 1372,
 'of': 1010,
 'all': 42,
 'dishes': 434,
 'salmon': 1265,
 'best': 155,
 'but': 223,
 'were': 1657,
 'great': 665,
 'fries': 615,
 'hot': 750,
 'and': 57,
 'neither': 988,
 'my': 974,
 'burger': 215,
 'in': 780,
 'fact': 540,
 'going': 650,
 'round': 1250,
 'up': 1583,
 'stars': 1403,
 'just': 818,
 'because': 143,
 'she': 1323,
 'awesome': 106,
 'go': 647,
 'back': 115,
 'next': 991,
 'trip': 1553,
 'this': 1511,
 'first': 577,
 'crawfish': 350,
 'experience': 531,
 'delicious': 393,
 'could': 336,
 'stomach': 1418,
 'meal': 913,
 'didn': 411,
 'complain': 312,
 'business': 220,
 'lunch': 889,
 

In [69]:
X_train=vectorizer.transform(x_train)
X_test=vectorizer.transform(x_test)

In [70]:
X_train

<750x1714 sparse matrix of type '<class 'numpy.int64'>'
	with 7368 stored elements in Compressed Sparse Row format>

In [71]:
X_test.data

array([1, 1, 1, ..., 1, 1, 1])

In [72]:
classifier = LogisticRegression()

In [73]:
classifier.fit(X_train, y_train)

LogisticRegression()

In [74]:
score = classifier.score(X_test, y_test)

In [75]:
print(f'Accureccy is {score:.4f}')

Accureccy is 0.7960


In [76]:
for source in df['source'].unique():
    df_source = df[df['source'] == source]
    sentences = df_source['sentence'].values
    y = df_source['label'].values

    sentences_train, sentences_test, y_train, y_test = train_test_split(
        sentences, y, test_size=0.25, random_state=1000)

    vectorizer = CountVectorizer()
    vectorizer.fit(sentences_train)
    X_train = vectorizer.transform(sentences_train)
    X_test  = vectorizer.transform(sentences_test)

    classifier = LogisticRegression()
    classifier.fit(X_train, y_train)
    score = classifier.score(X_test, y_test)
    print('Accuracy for {} data: {:.4f}'.format(source, score))

Accuracy for yelp data: 0.7960
Accuracy for amazon data: 0.7960
Accuracy for imdb data: 0.7487


In [77]:
predict=classifier.predict(X_test)
predict

array([1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0,
       1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0,
       0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1,
       1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1])

In [78]:
y_test

array([1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0,
       0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1])

In [79]:
new_data = vectorizer.transform(['bad experience','loved it', 'the food is just bad', 'the food is so good'])
predict=classifier.predict(new_data)
predict

array([0, 1, 0, 1])