In [0]:
import numpy as np
import pandas as pd

In [104]:
from google.colab import drive
drive.mount('gdrive')

Drive already mounted at gdrive; to attempt to forcibly remount, call drive.mount("gdrive", force_remount=True).


In [0]:
# Read data
df  = pd.read_csv('gdrive/My Drive/FTMLE - Tonga/Data/movie_review.csv', encoding='utf-8', sep='\t')

In [119]:
# show first five rows of dataset
df.head()

Unnamed: 0,id,review,sentiment
0,5814_8,With all this stuff going down at the moment w...,1
1,2381_9,"\The Classic War of the Worlds\"" by Timothy Hi...",1
2,7759_3,The film starts with a manager (Nicholas Bell)...,0
3,3630_4,It must be assumed that those who praised this...,0
4,9495_8,Superbly trashy and wondrously unpretentious 8...,1


In [163]:
# dataset includes three column is ID, review and sentiment
# with about 22500 rows.
# dataset talk about a lot of reviews and the positive and negative of these reviews
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22500 entries, 0 to 22499
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         22500 non-null  object
 1   review     22500 non-null  object
 2   sentiment  22500 non-null  int64 
 3   score      22500 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 703.2+ KB



**Step 1: Creat TfidfVectorizer**


In [120]:
# Down stopword from nltk and import stopwords
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import PorterStemmer
import re

# Define PorterStemmer as porter and stopwords as stop variable
porter = PorterStemmer()

stop = stopwords.words('english')

def tokenizer_porter(text):
    # Your code here
    return [porter.stem(word) for word in text.split()]

def preprocessor(text):
    """ Return a cleaned version of text
    """
    # Remove HTML markup
    text = re.sub('<[^>]*>', '', text)
    # Save emoticons for later appending
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    # Remove any non-word character and append the emoticons,
    # removing the nose character for standarization. Convert to lower case
    text = (re.sub('[\W]+', ' ', text.lower()) + ' ' + ' '.join(emoticons).replace('-', ''))
    
    return text

tfidf = TfidfVectorizer(stop_words=stop,
                        tokenizer=tokenizer_porter,
                        preprocessor=preprocessor)


In [116]:
# Check vector tfidf
tfidf

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2',
                preprocessor=<function preprocessor at 0x7f22fb6f4f28>,
                smooth_idf=True,
                stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours',
                            'ourselves', 'you', "you're", "you've", "you'll",
                            "you'd", 'your', 'yours', 'yourself', 'yourselves',
                            'he', 'him', 'his', 'himself', 'she', "she's",
                            'her', 'hers', 'herself', 'it', "it's", 'its',
                            'itself', ...],
                strip_accents=None, sublinear_tf=False,
                token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=<function tokenizer_porter at 0x7f22fb6f4ea0>,
               

In [0]:
# Remove special characters
df['review'] = df['review'].apply(preprocessor)

**Step 2:classify**

In [0]:
from sklearn.model_selection import train_test_split
# split the dataset and get data train and test
X = df['review']
y = df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y , test_size = 0.2)

In [0]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# I use Pipeline to help handle data train and test
clf = Pipeline([('vect', tfidf),
                ('clf', LogisticRegression(random_state=0))])

clf.fit(X_train, y_train)


In [164]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Import  three metrics to evaluate my model
predictions = clf.predict(X_test)
# Accuracy about
print(accuracy_score(y_test, predictions))
print(classification_report(y_test, predictions))

0.8904444444444445
              precision    recall  f1-score   support

           0       0.90      0.87      0.89      2221
           1       0.88      0.91      0.89      2279

    accuracy                           0.89      4500
   macro avg       0.89      0.89      0.89      4500
weighted avg       0.89      0.89      0.89      4500



In [0]:
# Get X_test from dataset movie_review_evaluation.csv
dftest = pd.read_csv('/content/gdrive/My Drive/FTMLE - Tonga/Data/movie_review_evaluation.csv', encoding='utf-8', sep='\t')

In [0]:
pre_X_test = dftest['review']

In [132]:
pre_predictions = clf.predict(pre_X_test)
# Accuracy same above
accuracy_score(y_test, predictions)

0.8904444444444445

In [0]:
# Add result predict to column predictions and save dataset to thi.csv
predictions_values = pre_predictions.tolist()
dftest['predictions'] = predictions_values
dftest.to_csv('thi.csv', index = False)

**Orther Predict By Score in ID**

In [167]:
df

Unnamed: 0,id,review,sentiment
0,5814_8,With all this stuff going down at the moment w...,1
1,2381_9,"\The Classic War of the Worlds\"" by Timothy Hi...",1
2,7759_3,The film starts with a manager (Nicholas Bell)...,0
3,3630_4,It must be assumed that those who praised this...,0
4,9495_8,Superbly trashy and wondrously unpretentious 8...,1
...,...,...,...
22495,3453_3,It seems like more consideration has gone into...,0
22496,5064_1,I don't believe they made this film. Completel...,0
22497,10905_3,"Guy is a loser. Can't get girls, needs to buil...",0
22498,10194_3,This 30 minute documentary BuÃ±uel made in the...,0


As we can see, the last number in Id column same as the score, I 
decided to handle it and predict by the score

In [0]:
def getscore(text):
  return text.split('_')[1]

df['score'] = df['id'].apply(getscore)
df['score'] = df['score'].astype('int64')

In [0]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
X_sc = df[['score']].to_numpy()
y_sc = df['sentiment'].to_numpy().reshape((-1, 1))

X_sc_train, X_sc_test, y_sc_train, y_sc_test = train_test_split(X_sc, y_sc , test_size = 0.2)

In [142]:
from sklearn.linear_model import LogisticRegression
test = LogisticRegression()
test.fit(X_sc_train,y_sc_train)
prediction = test.predict(X_sc_test)
test.score( X_sc_test,y_sc_test)

  y = column_or_1d(y, warn=True)


1.0