In [7]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

## `Loading data and extracting it`

In [6]:
my_files={
    'yelp':'yelp_labelled.txt',
    'amazon':'amazon_cells_labelled.txt',
    'imdb':'imdb_labelled.txt'
}
df_list = []

for source,filepath in my_files.items():
    df=pd.read_csv(filepath,names=['sentance','label'],sep='\t')
    df['source']=source
    df_list.append(df)
df=pd.concat(df_list)
df


Unnamed: 0,sentance,label,source
0,Wow... Loved this place.,1,yelp
1,Crust is not good.,0,yelp
2,Not tasty and the texture was just nasty.,0,yelp
3,Stopped by during the late May bank holiday of...,1,yelp
4,The selection on the menu was great and so wer...,1,yelp
...,...,...,...
743,I just got bored watching Jessice Lange take h...,0,imdb
744,"Unfortunately, any virtue in this film's produ...",0,imdb
745,"In a word, it is embarrassing.",0,imdb
746,Exceptionally bad!,0,imdb


## `Split the data into a training and testing set` 

In [58]:
# Filter based on yelp source or to get the sentances just from yelp
df_yelp=df[df['source']=='yelp']
# Convert to numpy array, try to remove `values` you get a series instead
sentances=df_yelp['sentance'].values
y=df_yelp['label'].values
# Spliting data into train and test
sentances_train,sentances_test,y_train,y_test=train_test_split(sentances,y,test_size=0.25,random_state=100)

## **`Vectorize the sentences using`** `CountVectorizer`

In [59]:
vectorizer=CountVectorizer()
vectorizer.fit(sentances_train)
X_train=vectorizer.transform(sentances_train)
X_test=vectorizer.transform(sentances_test)

## `Using logistic regression fromsklearn library`

In [60]:
from sklearn.linear_model import LogisticRegression

classifier=LogisticRegression()
classifier.fit(X_train, y_train)
score = classifier.score(X_test, y_test)
print(f'Our accruracy for this model is {score*100:.1f}%')

Our accruracy for this model is 80.8%


## `Lets take a look for the other data set we have`

In [62]:
for source in df['source'].unique():
    df_source = df[df['source'] == source]
    sentences = df_source['sentance'].values
    y = df_source['label'].values

    sentences_train, sentences_test, y_train, y_test = train_test_split(
        sentences, y, test_size=0.25, random_state=100)

    vectorizer = CountVectorizer()
    vectorizer.fit(sentences_train)
    X_train = vectorizer.transform(sentences_train)
    X_test  = vectorizer.transform(sentences_test)

    classifier = LogisticRegression()
    classifier.fit(X_train, y_train)
    score = classifier.score(X_test, y_test)
    print(f'Accuracy for {source} Data is {score*100:.1f}%')

Accuracy for yelp Data is 80.8%
Accuracy for amazon Data is 82.0%
Accuracy for imdb Data is 73.8%
