<a href="https://colab.research.google.com/github/lucascoelhoo/master-classifier-api/blob/main/ArtigoMetaSearchMethod_ICSBT2022.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Dataset: Polarity dataset v2.0
# http://www.cs.cornell.edu/people/pabo/movie-review-data/
#
# Discussion at https://medium.com/@vasista/sentiment-analysis-textblob-vs-svm-338d418e3ff1

from sklearn.feature_extraction.text import TfidfVectorizer
import time
from sklearn import svm
from sklearn.metrics import classification_report
import pandas as pd

# train Data
trainData = pd.read_csv("https://raw.githubusercontent.com/Vasistareddy/sentiment_analysis/master/data/train.csv")
print(trainData.iloc[:10])
print("\n\n")
# test Data
testData = pd.read_csv("https://raw.githubusercontent.com/Vasistareddy/sentiment_analysis/master/data/test.csv")

# Create feature vectors
vectorizer = TfidfVectorizer(min_df = 5,
                             max_df = 0.8,
                             sublinear_tf = True,
                             use_idf = True)

train_vectors = vectorizer.fit_transform(trainData['Content'])
test_vectors = vectorizer.transform(testData['Content'])

print(train_vectors)
print("\n")



# Perform classification with SVM, kernel=linear
classifier_linear = svm.SVC(kernel='linear')
t0 = time.time()
classifier_linear.fit(train_vectors, trainData['Label'])
t1 = time.time()
prediction_linear = classifier_linear.predict(test_vectors)
t2 = time.time()
time_linear_train = t1-t0
time_linear_predict = t2-t1

# results
print("Results for SVC(kernel=linear)")
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
report = classification_report(testData['Label'], prediction_linear, output_dict=True)
print('positive: ', report['pos'])
print('negative: ', report['neg'])



                                             Content Label
0  every once in a while you see a film that is s...   pos
1  the love for family is one of the strongest dr...   pos
2  after the terminally bleak reservoir dogs and ...   pos
4  having not seen , " who framed roger rabbit " ...   pos
5  melvin van peebles' " sweet sweetback's baadas...   pos
6  edward zwick's " the siege " raises more quest...   pos
7  an energetic , visually stunning , but intelle...   pos
8  the keen wisdom of an elderly bank robber , th...   pos
9  is jimmy stewart the greatest actor of all-tim...   pos



  (0, 12442)	0.02702055838392124
  (0, 1128)	0.02571589890424715
  (0, 4118)	0.05099142562618731
  (0, 5847)	0.05037536781236324
  (0, 4138)	0.06715571036579193
  (0, 8958)	0.04936626445345458
  (0, 5246)	0.04732970629650998
  (0, 9680)	0.0426920338509739
  (0, 7331)	0.04978865745480442
  (0, 3603)	0.06715571036579193
  (0, 2758)	0.05381945880415767
  (0, 9350)	0.02724826521754145
  (0, 3136)	0.037197627

In [None]:
testData['Label']=prediction_linear
print(testData)

                                               Content Label
0    hedwig ( john cameron mitchell ) was born a bo...   pos
1    one of the more unusual and suggestively viole...   pos
2    what do you get when you combine clueless and ...   neg
3    >from the man who presented us with henry : th...   pos
4    tibet has entered the american consciousness s...   pos
..                                                 ...   ...
195  my inner flag was at half-mast last year when ...   neg
196  if anything , " stigmata " should be taken as ...   neg
197  woof ! too bad that leap of faith was the titl...   neg
198  the plot of big momma's house is martin lawren...   neg
199  in the year 2029 , captain leo davidson ( mark...   pos

[200 rows x 2 columns]


In [None]:
# train Data
trainData = pd.read_csv("https://raw.githubusercontent.com/Vasistareddy/sentiment_analysis/master/data/train.csv")
print(trainData.iloc[:10])

                                             Content Label
0  every once in a while you see a film that is s...   pos
1  the love for family is one of the strongest dr...   pos
2  after the terminally bleak reservoir dogs and ...   pos
4  having not seen , " who framed roger rabbit " ...   pos
5  melvin van peebles' " sweet sweetback's baadas...   pos
6  edward zwick's " the siege " raises more quest...   pos
7  an energetic , visually stunning , but intelle...   pos
8  the keen wisdom of an elderly bank robber , th...   pos
9  is jimmy stewart the greatest actor of all-tim...   pos


In [None]:
# Separating selected original data and inflated data dataframes
data = {"Content":[], "Label":[], "Enriched":[]}

df = pd.DataFrame(data)

selected = trainData[895:904][["Content","Label"]]
#inflated1 = trainData[:895][["Content","Label"]]
#inflated2 = trainData[905:][["Content","Label"]]

inflated1 = trainData[:100][["Content","Label"]]
inflated2 = trainData[1700:][["Content","Label"]]

inflated_frames = [inflated1, inflated2]

originalData = selected.copy()
inflatedData = pd.concat(inflated_frames).copy()
print(originalData)
print(inflatedData)

                                               Content Label
895  the muppet movie is the first , and the best m...   pos
896  so many students strive to get into schools su...   pos
897  there's something about ben stiller that makes...   pos
898  the most common ( and in many cases the only )...   pos
899  capsule : trippy , hyperspeed action machine f...   pos
900   " my name is jack carter , and you don't want...   neg
901  if you've been following william fichtner's ca...   neg
902  it's almost amusing to watch 21-year old chris...   neg
903  the word to describe sharon stone is " wonder ...   neg
                                                Content Label
0     every once in a while you see a film that is s...   pos
1     the love for family is one of the strongest dr...   pos
2     after the terminally bleak reservoir dogs and ...   pos
4     having not seen , " who framed roger rabbit " ...   pos
...                                                 ...   ...
1795   " holy man 

In [None]:
# Inflating selected original dataframe
originalData['Inflated']=""
for index_selected, row_selected in originalData.iterrows():
  frames=[]
  for index_inflated, row_inflated in inflatedData.iterrows():
    if row_inflated['Label'] == row_selected['Label']:
      frames.append(row_inflated['Content'])
  row_selected['Inflated']=frames

In [None]:
print(originalData)

                                               Content Label  \
895  the muppet movie is the first , and the best m...   pos   
896  so many students strive to get into schools su...   pos   
897  there's something about ben stiller that makes...   pos   
898  the most common ( and in many cases the only )...   pos   
899  capsule : trippy , hyperspeed action machine f...   pos   
900   " my name is jack carter , and you don't want...   neg   
901  if you've been following william fichtner's ca...   neg   
902  it's almost amusing to watch 21-year old chris...   neg   
903  the word to describe sharon stone is " wonder ...   neg   

                                              Inflated  
895  [every once in a while you see a film that is ...  
896  [every once in a while you see a film that is ...  
897  [every once in a while you see a film that is ...  
898  [every once in a while you see a film that is ...  
899  [every once in a while you see a film that is ...  
900  [ " nothing 

In [None]:
# Function to perform keyword search on enriched data
def findWord(keyword):
  for index, row in originalData.iterrows():
    if keyword in row['Content']:
      print(row)
      print("\n")
      continue
    for row_inflated in row['Inflated']:
      if keyword in row_inflated:
        print("***********Selected data:")
        print(row)
        print("***********Size of inflated data array:")
        print(len(row_inflated))
        print("***********Begin of Inflated data array:")
        print("{0:>5}".format(row_inflated))
        print("\n")
        break
      

In [None]:
findWord("worst of all")

***********Selected data:
Content      " my name is jack carter , and you don't want...
Label                                                     neg
Inflated    [ " nothing more than a high budget masturbati...
Name: 900, dtype: object
***********Size of inflated data array:
2765
***********Begin of Inflated data array:
plot : token director alan smithee steals the only copy of his film " trio " from the studio , after they complete the " final cut " without him . 
he threatens to burn the film reel if they do not allow him to keep his vision . 
critique : wow . 
i really can't remember the last time a movie sucked on so many levels ! 
the " comedy " in this film is pathetic , obvious and dated ( oj simpson jokes galore ) . 
the plot is uninteresting , boring and bad . 
the structure of the film is annoying , repetitive and pretentious . 
the acting is pretty bad , especially jackie chan , who can't act to save his life . 
the cameos are lame and seem forced , and the ending blows chu

In [None]:
findWord("disgust")

Content      " my name is jack carter , and you don't want...
Label                                                     neg
Inflated    [ " nothing more than a high budget masturbati...
Name: 900, dtype: object
***********Found in inflated data:
Content     if you've been following william fichtner's ca...
Label                                                     neg
Inflated    [ " nothing more than a high budget masturbati...
Name: 901, dtype: object
***********Found in inflated data:
Content     it's almost amusing to watch 21-year old chris...
Label                                                     neg
Inflated    [ " nothing more than a high budget masturbati...
Name: 902, dtype: object
***********Found in inflated data:
Content     the word to describe sharon stone is " wonder ...
Label                                                     neg
Inflated    [ " nothing more than a high budget masturbati...
Name: 903, dtype: object
***********Found in inflated data:


In [None]:
findWord("amazed")

Content     the muppet movie is the first , and the best m...
Label                                                     pos
Inflated    [every once in a while you see a film that is ...
Name: 895, dtype: object
***********Found in inflated data:
Content     so many students strive to get into schools su...
Label                                                     pos
Inflated    [every once in a while you see a film that is ...
Name: 896, dtype: object
***********Found in inflated data:
Content     there's something about ben stiller that makes...
Label                                                     pos
Inflated    [every once in a while you see a film that is ...
Name: 897, dtype: object
***********Found in inflated data:
Content     the most common ( and in many cases the only )...
Label                                                     pos
Inflated    [every once in a while you see a film that is ...
Name: 898, dtype: object
***********Found in inflated data:
Content     caps

In [None]:
findWord("watch again")

Content     the muppet movie is the first , and the best m...
Label                                                     pos
Inflated    [every once in a while you see a film that is ...
Name: 895, dtype: object
***********Found in inflated data:
Content     so many students strive to get into schools su...
Label                                                     pos
Inflated    [every once in a while you see a film that is ...
Name: 896, dtype: object
***********Found in inflated data:
Content     there's something about ben stiller that makes...
Label                                                     pos
Inflated    [every once in a while you see a film that is ...
Name: 897, dtype: object
***********Found in inflated data:
Content     the most common ( and in many cases the only )...
Label                                                     pos
Inflated    [every once in a while you see a film that is ...
Name: 898, dtype: object
***********Found in inflated data:
Content     caps

In [None]:
findWord("good")

Content     the muppet movie is the first , and the best m...
Label                                                     pos
Inflated    [every once in a while you see a film that is ...
Name: 895, dtype: object
***********Found in inflated data:
Content     so many students strive to get into schools su...
Label                                                     pos
Inflated    [every once in a while you see a film that is ...
Name: 896, dtype: object


Content     there's something about ben stiller that makes...
Label                                                     pos
Inflated    [every once in a while you see a film that is ...
Name: 897, dtype: object
***********Found in inflated data:
Content     the most common ( and in many cases the only )...
Label                                                     pos
Inflated    [every once in a while you see a film that is ...
Name: 898, dtype: object


Content     capsule : trippy , hyperspeed action machine f...
Label               