In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression



In [2]:
# Map each review source to the path of its labelled text file.
# Insertion order (yelp, amazon, imdb) is the order the files are read in.
filepath_dict = dict(
    yelp='./data/yelp_labelled.txt',
    amazon='./data/amazon_cells_labelled.txt',
    imdb='./data/imdb_labelled.txt',
)

# Will collect one DataFrame per source before they are concatenated.
df_list = list()

In [3]:
# Read every tab-separated file into a frame with 'sentence' and 'label'
# columns, and tag each row with the source it came from.
#
# The list is rebuilt from scratch on every run: appending to a list created
# in another cell would duplicate all rows whenever this cell is re-executed.
# NOTE(review): default CSV quoting is in effect here; if the raw files contain
# unbalanced double quotes, several lines can be merged into one row — confirm
# the row count per source against the files.
df_list = [
    pd.read_csv(filepath, names=['sentence', 'label'], sep='\t').assign(source=source)
    for source, filepath in filepath_dict.items()
]

# ignore_index gives the combined frame a unique 0..n-1 index instead of
# repeating each file's own row labels.
df = pd.concat(df_list, ignore_index=True)
df.head(5)

Unnamed: 0,sentence,label,source
0,Wow... Loved this place.,1,yelp
1,Crust is not good.,0,yelp
2,Not tasty and the texture was just nasty.,0,yelp
3,Stopped by during the late May bank holiday of...,1,yelp
4,The selection on the menu was great and so wer...,1,yelp


## For yelp Only

In [4]:
# Keep only the Yelp rows, then hold out 25% of them for evaluation.
# random_state is fixed so the split is reproducible.
df_yelp = df.loc[df['source'] == 'yelp']
X, y = df_yelp['sentence'].values, df_yelp['label'].values
sentences_train, sentences_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1000,
)



In [5]:
# Learn the vocabulary from the training sentences only, then encode both
# splits as token-count matrices using that same vocabulary.
vectorizer = CountVectorizer().fit(sentences_train)
X_train, X_test = (
    vectorizer.transform(split) for split in (sentences_train, sentences_test)
)
X_train


<750x1714 sparse matrix of type '<class 'numpy.int64'>'
	with 7368 stored elements in Compressed Sparse Row format>

In [6]:
# Baseline: default-parameter logistic regression on the token counts,
# scored with mean accuracy on the held-out split.
classifier = LogisticRegression().fit(X_train, y_train)
score = classifier.score(X_test, y_test)
f'Accuracy: {score*100}%'

'Accuracy: 79.60000000000001%'

## For all data sources (one model per source)

In [7]:
# Train and evaluate one bag-of-words + logistic-regression model per source,
# print each source's test accuracy, then print the unweighted mean.
#
# The same estimator object is refitted on every iteration, so after the loop
# `vectorizer` and `classifier` hold the model for the LAST source only.
classifier = LogisticRegression()
sources = df['source'].unique()
overall_accuracy = 0
for source in sources:
    df_source = df[df['source'] == source]
    sentences = df_source['sentence'].values
    y = df_source['label'].values

    sentences_train, sentences_test, y_train, y_test = train_test_split(
        sentences, y, test_size=0.25, random_state=1000)

    # The vocabulary is learned from the training split only; the test split
    # is encoded with that same vocabulary.
    vectorizer = CountVectorizer()
    X_train = vectorizer.fit_transform(sentences_train)
    X_test = vectorizer.transform(sentences_test)

    classifier.fit(X_train, y_train)
    score = classifier.score(X_test, y_test)
    overall_accuracy += score
    print('Accuracy for {} data: {:.4f}%'.format(source, score*100))

# Divide by the number of sources actually evaluated rather than a literal 3,
# so the mean stays correct if a file is added to or removed from the dataset.
avgAccuracy = (overall_accuracy / len(sources)) * 100
# Built-in round() works for both NumPy scalars and plain Python floats,
# whereas the .round() method exists only on NumPy scalars.
print(f"Total Avg Accuracy : {round(avgAccuracy, 4)}%")


Accuracy for yelp data: 79.6000%
Accuracy for amazon data: 79.6000%
Accuracy for imdb data: 74.8663%
Total Avg Accuracy : 78.0221%


#### My tests on the last-trained model (IMDb)

In [8]:
# Hand-written probe sentences.
# `vectorizer` and `classifier` are the objects left over from the final
# iteration of the per-source loop, i.e. the model for the last source
# processed (imdb, per the printed order).
sentences = [
    'Love that shit you scam people with',
    'really impressive!',
    'Hmm Not sure what to say about it',
    "if I were you I'll literally close this brand with satisfaction",
]
x_test = vectorizer.transform(sentences)
classifier.predict(x_test)

array([1, 1, 0, 0])

# Conclusion:
In the last test I passed four deliberately ambiguous sentences to the model, and it classified three of them correctly. That is consistent with the model's measured test accuracy of about 75%.
