## Initalize labraries

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

#### Reading the files

In [5]:
filepath_dict = {'yelp':   'assets/yelp_labelled.txt',
                 'amazon': 'assets/amazon_cells_labelled.txt',
                 'imdb':   'assets/imdb_labelled.txt'}

In [7]:
df_list = []

In [9]:
filepath_dict.items()

dict_items([('yelp', 'assets/yelp_labelled.txt'), ('amazon', 'assets/amazon_cells_labelled.txt'), ('imdb', 'assets/imdb_labelled.txt')])

### Looping on files and extract text to clomns

In [11]:
for source, filepath in filepath_dict.items():
    df = pd.read_csv(filepath, names=['sentence', 'label'], sep='\t')
    df['source'] = source
    df_list.append(df)  

### get first records

In [13]:
df_list[0].head()

Unnamed: 0,sentence,label,source
0,Wow... Loved this place.,1,yelp
1,Crust is not good.,0,yelp
2,Not tasty and the texture was just nasty.,0,yelp
3,Stopped by during the late May bank holiday of...,1,yelp
4,The selection on the menu was great and so wer...,1,yelp


### append all test to array

In [15]:
df = pd.concat(df_list)
df.head()

Unnamed: 0,sentence,label,source
0,Wow... Loved this place.,1,yelp
1,Crust is not good.,0,yelp
2,Not tasty and the texture was just nasty.,0,yelp
3,Stopped by during the late May bank holiday of...,1,yelp
4,The selection on the menu was great and so wer...,1,yelp


### reading the last records 

In [17]:
df.tail()

Unnamed: 0,sentence,label,source
743,I just got bored watching Jessice Lange take h...,0,imdb
744,"Unfortunately, any virtue in this film's produ...",0,imdb
745,"In a word, it is embarrassing.",0,imdb
746,Exceptionally bad!,0,imdb
747,All in all its an insult to one's intelligence...,0,imdb


### Get liner regression

In [19]:
df_yelp = df[df['source'] == 'yelp']

x_yelp = df_yelp['sentence'].values
y_yelp = df_yelp['label'].values

In [23]:
x_train, x_test, y_train, y_test = train_test_split(x_yelp, y_yelp, test_size=0.25, random_state=1000)

### Get vectorizer

In [25]:
 vectorizer = CountVectorizer()

In [27]:
vectorizer.fit(x_train)

CountVectorizer()

In [29]:
vectorizer.vocabulary_

{'have': 804,
 'watched': 1881,
 'their': 1720,
 'prices': 1327,
 'inflate': 897,
 'portions': 1311,
 'get': 722,
 'smaller': 1564,
 'and': 61,
 'management': 1041,
 'attitudes': 107,
 'grow': 770,
 'rapidly': 1367,
 'find': 653,
 'wasting': 1879,
 'food': 677,
 'to': 1749,
 'be': 148,
 'despicable': 462,
 'but': 251,
 'this': 1734,
 'just': 935,
 'wasn': 1876,
 'first': 659,
 'time': 1744,
 'going': 737,
 'think': 1730,
 'will': 1917,
 'quickly': 1361,
 'become': 157,
 'regular': 1406,
 'cant': 271,
 'say': 1483,
 'enough': 574,
 'good': 741,
 'things': 1729,
 'about': 23,
 'place': 1285,
 'we': 1888,
 'got': 744,
 'sitting': 1556,
 'fairly': 620,
 'fast': 631,
 'ended': 569,
 'up': 1818,
 'waiting': 1863,
 '40': 14,
 'minutes': 1090,
 'our': 1203,
 'order': 1194,
 'another': 65,
 '30': 11,
 'before': 162,
 'the': 1718,
 'arrived': 90,
 'fried': 694,
 'rice': 1435,
 'was': 1874,
 'dry': 536,
 'as': 94,
 'well': 1897,
 'chef': 306,
 'generous': 720,
 'with': 1924,
 'his': 835,
 'even':

In [31]:
X_train=vectorizer.transform(x_train)
X_test=vectorizer.transform(x_test)

In [33]:
X_train

<1500x1965 sparse matrix of type '<class 'numpy.int64'>'
	with 14582 stored elements in Compressed Sparse Row format>

In [35]:
X_test.data

array([1, 1, 1, ..., 2, 1, 2])

In [37]:
classifier = LogisticRegression()

In [39]:
classifier.fit(X_train, y_train)

LogisticRegression()

In [41]:
score = classifier.score(X_test, y_test)

In [43]:
print(f'Accureccy is {score:.4f}')

Accureccy is 0.9260


### Get Accuarcy and predict

In [45]:
for source in df['source'].unique():
    df_source = df[df['source'] == source]
    sentences = df_source['sentence'].values
    y = df_source['label'].values

    sentences_train, sentences_test, y_train, y_test = train_test_split(
        sentences, y, test_size=0.25, random_state=1000)

    vectorizer = CountVectorizer()
    vectorizer.fit(sentences_train)
    X_train = vectorizer.transform(sentences_train)
    X_test  = vectorizer.transform(sentences_test)

    classifier = LogisticRegression()
    classifier.fit(X_train, y_train)
    score = classifier.score(X_test, y_test)
    print('Accuracy for {} data: {:.4f}'.format(source, score))

Accuracy for yelp data: 0.9260
Accuracy for amazon data: 0.9380
Accuracy for imdb data: 0.9225


In [47]:
predict=classifier.predict(X_test)
predict

array([1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1,
       0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0,
       1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1,

In [49]:
y_test

array([1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1,
       1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1,
       0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0,
       1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0,
       1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1,

In [51]:
new_data = vectorizer.transform(['bad experience','loved it', 'the food is just bad', 'the food is so good'])
predict=classifier.predict(new_data)
predict

array([0, 1, 0, 1])