## import pandas library and sklearn

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

## Put files path

In [13]:
filepath_dict = {'yelp':   'yelp_labelled.txt',
                 'amazon': 'amazon_cells_labelled.txt',
                 'imdb':   'imdb_labelled.txt'}

## Read the data

In [12]:
df_list = [] 

for source, path in filepath_dict.items():
    df = pd.read_csv(path, names=['sentence', 'label'], sep='\t')
    df['source'] = source 
    df_list.append(df)

df = pd.concat(df_list)
print(df.iloc[0])

sentence    Wow... Loved this place.
label                              1
source                          yelp
Name: 0, dtype: object


## Declare vectorizer

In [18]:
sentences = ['John likes ice cream', 'John hates chocolate.']

In [19]:
vectorizer = CountVectorizer(min_df=0, lowercase=False)
vectorizer

CountVectorizer(lowercase=False, min_df=0)

## Fitting the data 

In [10]:
vectorizer.fit(sentences)
vectorizer.vocabulary_

{'John': 0, 'likes': 5, 'ice': 4, 'cream': 2, 'hates': 3, 'chocolate': 1}

## Yelp The Data

In [21]:
df_yelp = df[df['source'] == 'yelp']
df_yelp

Unnamed: 0,sentence,label,source
0,Wow... Loved this place.,1,yelp
1,Crust is not good.,0,yelp
2,Not tasty and the texture was just nasty.,0,yelp
3,Stopped by during the late May bank holiday of...,1,yelp
4,The selection on the menu was great and so wer...,1,yelp
...,...,...,...
995,I think food should have flavor and texture an...,0,yelp
996,Appetite instantly gone.,0,yelp
997,Overall I was not impressed and would not go b...,0,yelp
998,"The whole experience was underwhelming, and I ...",0,yelp


In [41]:
sentences = df_yelp['sentence'].values

In [None]:
y = df_yelp['label'].values

In [37]:
x_train , x_test , y_train , y_test = train_test_split(sentences, y, test_size=0.6, random_state=100)

In [28]:
vectorizer = CountVectorizer()
vectorizer

vectorizer.fit(x_train)
vectorizer

CountVectorizer()

In [29]:
X_train = vectorizer.transform(x_train)
X_train

<400x1185 sparse matrix of type '<class 'numpy.int64'>'
	with 3911 stored elements in Compressed Sparse Row format>

In [30]:
X_test  = vectorizer.transform(x_test)
X_test

<600x1185 sparse matrix of type '<class 'numpy.int64'>'
	with 4789 stored elements in Compressed Sparse Row format>

In [39]:
classifier = LogisticRegression()
classifier.fit(X_train, y_train)
score = classifier.score(X_test, y_test)
score

print("Accuracy = ", score)

Accuracy =  0.79


In [40]:
for source in df['source'].unique():
    df_source = df[df['source'] == source]
    sentences = df_source['sentence'].values
    y = df_source['label'].values

    sentences_train, sentences_test, y_train, y_test = train_test_split(
        sentences, y, test_size=0.25, random_state=1000)

    vectorizer = CountVectorizer()
    vectorizer.fit(sentences_train)
    X_train = vectorizer.transform(sentences_train)
    X_test  = vectorizer.transform(sentences_test)

    classifier = LogisticRegression()
    classifier.fit(X_train, y_train)
    score = classifier.score(X_test, y_test)
    print('Accuracy for {} data: {:.4f}'.format(source, score))

Accuracy for yelp data: 0.7960
Accuracy for amazon data: 0.7960
Accuracy for imdb data: 0.7487
