In [2]:
import pandas as pd # for reading csv file
import re # for preprocessing text
import string # for preprocessing text
from sklearn.feature_extraction.text import CountVectorizer # to create Bag of words
from sklearn.model_selection import train_test_split  # for splitting data
from sklearn.naive_bayes import GaussianNB # to bulid classifier model
from sklearn.preprocessing import LabelEncoder # to convert classes to number 
from sklearn.metrics import accuracy_score # to calculate accuracy
import nltk # for processing texts
from nltk.corpus import stopwords # list of stop words
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/samaryaseen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/samaryaseen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/samaryaseen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
data = pd.read_csv('agecategory.csv',engine='python')

In [4]:
data.shape

(4930, 2)

In [5]:
data.sentiment.value_counts()

above 18     2465
under age    2465
Name: sentiment, dtype: int64

In [6]:
data.isna().sum()

review       0
sentiment    0
dtype: int64

In [7]:
data['review'][20]

'Like those who listened to radio reports about the attack on Pearl Harbor, every one who has ever seen PINK FLAMINGOS can tell you exactly where they were when they first saw it--and some thirty years later the movie is still one of the most unspeakably vile, obnoxious, repulsive, and hilariously funny films ever put to celluloid, guaranteed to test the strongest stomachs and the toughest funny bones.<br /><br />Filmed with a close-to-zero budget and some of the shakiest cinematography around, PINK FLAMINGOS tells the story of two families that compete for the tabloid title of "The Filthiest People Alive." Just how filthy can they be? Plenty: the film includes everything from sex with chickens to what I can only describe as a remarkable display of rectal control to a heaping helping of doggie doo, and I guarantee that you won\'t want to eat an egg for at least several weeks after seeing it.<br /><br />The cast is either wonderful, atrocious, or atrociously wonderful, depending on how 

In [8]:
def clean_text(text):
  '''
  DESCRIPTION:
  This function to clean text 
  INPUT: 
  text: string
  OUTPUT: 
  text: string after clean it
  ''' 
  text = text.lower() # convert letters to lower case
  text = re.sub("[^a-zA-Z]", " ", text) # remove non-letters
  text = re.sub(r'\d+', '', text) # remove number
  text = re.sub(r'http\S+', '', text) # remove links
  text = text.translate(str.maketrans('','', string.punctuation)) # remove punctuation
  text = re.sub(' +', ' ',text) # remove extra space
  text = text.strip() # remove whitespaces

  text = ' '.join([word for word in text.split() if word not in stopwords.words("english")]) # remove stop words
  lemma = nltk.WordNetLemmatizer() # define lemmatizer
  text = ' '.join([lemma.lemmatize(word) for word in text.split()]) 
  return text


In [9]:
# The cleaning function applied in all reviews
data['review'] = data['review'].apply(clean_text)

In [10]:
max_features = 1500
count_vector = CountVectorizer(max_features = max_features)  
X = count_vector.fit_transform(data['review']).toarray() 
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [11]:
print("most using {} words: {} ".format(max_features, count_vector.get_feature_names()))


most using 1500 words: ['ability', 'able', 'absolutely', 'academy', 'accent', 'accept', 'accident', 'across', 'act', 'acted', 'acting', 'action', 'actor', 'actress', 'actual', 'actually', 'adam', 'adaptation', 'add', 'added', 'addition', 'admit', 'adult', 'adventure', 'affair', 'age', 'agent', 'ago', 'agree', 'ahead', 'air', 'alice', 'alien', 'alive', 'allowed', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'amazing', 'america', 'american', 'among', 'amount', 'amusing', 'andy', 'animal', 'animated', 'animation', 'anime', 'ann', 'anna', 'anne', 'annoying', 'another', 'answer', 'anti', 'anyone', 'anything', 'anyway', 'apart', 'apartment', 'apparently', 'appeal', 'appear', 'appearance', 'appears', 'appreciate', 'approach', 'area', 'army', 'around', 'art', 'arthur', 'artist', 'artistic', 'aside', 'ask', 'asked', 'aspect', 'atmosphere', 'attack', 'attempt', 'attention', 'attitude', 'audience', 'available', 'average', 'award', 'away', 'awesome', 'baby', 'back', 'backgr

In [12]:
print(count_vector.vocabulary_)

{'one': 931, 'reviewer': 1089, 'mentioned': 839, 'watching': 1439, 'episode': 420, 'right': 1094, 'exactly': 436, 'happened': 600, 'br': 150, 'first': 497, 'thing': 1330, 'scene': 1124, 'violence': 1419, 'set': 1161, 'word': 1475, 'go': 567, 'show': 1180, 'pull': 1034, 'drug': 381, 'sex': 1165, 'classic': 232, 'use': 1401, 'called': 175, 'given': 564, 'state': 1244, 'focus': 506, 'mainly': 803, 'city': 229, 'prison': 1018, 'front': 534, 'face': 453, 'high': 619, 'home': 632, 'many': 812, 'italian': 690, 'death': 327, 'dealing': 326, 'never': 900, 'far': 466, 'away': 92, 'would': 1484, 'say': 1119, 'main': 802, 'appeal': 66, 'due': 382, 'fact': 454, 'forget': 517, 'pretty': 1016, 'picture': 978, 'audience': 88, 'charm': 210, 'romance': 1104, 'around': 74, 'ever': 429, 'saw': 1118, 'surreal': 1292, 'watched': 1438, 'developed': 347, 'taste': 1306, 'got': 575, 'level': 761, 'graphic': 580, 'kill': 719, 'order': 937, 'get': 557, 'well': 1446, 'middle': 844, 'class': 231, 'turned': 1383, 'l

In [13]:
d = pd.DataFrame(X,columns=count_vector.get_feature_names())
d

Unnamed: 0,ability,able,absolutely,academy,accent,accept,accident,across,act,acted,...,wrong,wrote,year,yes,yet,york,young,younger,youth,zombie
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,2,0,1,0,1,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4925,0,0,0,0,0,0,0,0,0,0,...,1,0,1,0,3,0,2,0,0,0
4926,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4927,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
4928,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [14]:
# convert classes to number
y = encoder = LabelEncoder()
y = encoder.fit_transform(data['sentiment'])
y

array([0, 0, 0, ..., 0, 0, 0])

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size =0.3)


In [16]:
print(X_test)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [17]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

clf = SVC()
grid =  {'C':[0.5,1,1.5,5,5.5,10,10.5],
     'gamma':[0.000001, 0.0001, 0.001],
     'kernel':['rbf', 'poly']}
model = GridSearchCV(clf, grid,cv=2)

In [18]:
# train model
model.fit(X_train, y_train)

GridSearchCV(cv=2, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.5, 1, 1.5, 5, 5.5, 10, 10.5],
                         'gamma': [1e-06, 0.0001, 0.001],
                         'kernel': ['rbf', 'poly']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [19]:
# Predicting the Test set results 

y_pred=model.best_estimator_.predict(X_test)
y_pred

array([1, 0, 1, ..., 1, 1, 1])

In [20]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

#print('Train model accuracy: ',accuracy_score(y_train, y_pred))
print('Test model accuracy: ',accuracy_score(y_test, y_pred))
print('Test model Precision',precision_score(y_test, y_pred))
print('Test model Recall: ',recall_score(y_test, y_pred))
print('Test model F1: ',f1_score(y_test, y_pred))

Test model accuracy:  0.9127789046653144
Test model Precision 0.8935897435897436
Test model Recall:  0.9380888290713324
Test model F1:  0.9152987524622456


In [21]:
# input1 statment

#You can see blood in every scene
#This movie full of pornography 
#The brother killed his sister
#It’s fulll of drugs, alcohol and wine
#It’s a horror movie
#There’s a scary ghost 
#Nudity scenes are in this movie
#You can see blood in every scene

test_review1 = ['Nudity scenes are in this movie'] 

In [22]:
# convert to number
test_vector = count_vector.transform(test_review1)
print(test_vector)
test_vector = test_vector.toarray()

  (0, 875)	1
  (0, 916)	1


In [23]:
## encodeing predict class

text_predict_class = encoder.inverse_transform(model.predict(test_vector))
print(test_review1[0], 'is: ',text_predict_class[0])

Nudity scenes are in this movie is:  above 18


In [24]:
# input2 statment
#All my kids liked this movie
#This is the best cartoon movie
#This is an animation scenes
test_review2 = ['This is the best cartoon movie'] 

In [25]:
# convert to number
test_vector = count_vector.transform(test_review2)
test_vector = test_vector.toarray()

In [26]:
## encodeing predict class
text_predict_class = encoder.inverse_transform(model.predict(test_vector))
print(test_review2[0], 'is: ',text_predict_class[0])

This is the best cartoon movie is:  under age
