In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score

In [2]:
np.random.seed(500)

In [3]:
Corpus = pd.read_csv("mydata.csv",encoding='latin-1')

In [4]:
print(Corpus.head())

                                                text  label
0                               i feel troubled over    sad
1  i still cant shake the feeling that i might be...    sad
2  i now feel as if im doomed to fail my upcoming...    sad
3  i hate the feeling that i can t do anything us...    joy
4  After #CultureGeek yesterday, I have to recomm...  happy


In [5]:
Corpus['text'].dropna(inplace=True)

In [6]:
Corpus['text'] = [entry.lower() for entry in Corpus['text']]

In [7]:
Corpus['text']= [word_tokenize(entry) for entry in Corpus['text']]

In [8]:
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

In [9]:
for index,entry in enumerate(Corpus['text']):
    # Declaring Empty List to store the words that follow the rules for this step
    Final_words = []
    # Initializing WordNetLemmatizer()
    word_Lemmatized = WordNetLemmatizer()
    # pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
    for word, tag in pos_tag(entry):
        # Below condition is to check for Stop words and consider only alphabets
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_Final)
    # The final processed set of words for each iteration will be stored in 'text_final'
    Corpus.loc[index,'text_final'] = str(Final_words)

In [10]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(Corpus['text_final'],Corpus['label'],test_size=0.3)

In [11]:
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.fit_transform(Test_Y)

In [12]:
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(Corpus['text_final'])
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

In [13]:
print(Tfidf_vect.vocabulary_)



In [14]:
print(Train_X_Tfidf)

  (0, 3698)	0.6947092995708669
  (0, 3362)	0.48742509151184926
  (0, 1619)	0.12113239780591566
  (0, 634)	0.5149006811572601
  (1, 3281)	0.5969579411463479
  (1, 3018)	0.43526478305802246
  (1, 2602)	0.3436792704101388
  (1, 448)	0.3427449648663282
  (1, 133)	0.4675427609116628
  (2, 4815)	0.20179728750200202
  (2, 4788)	0.3440328752393306
  (2, 3850)	0.3881655975652131
  (2, 3514)	0.1972402863133105
  (2, 3286)	0.28919842768556486
  (2, 1825)	0.1887745348148807
  (2, 1798)	0.18272402500527155
  (2, 1691)	0.37251017559489247
  (2, 1619)	0.05943715801770526
  (2, 1360)	0.33888269564587675
  (2, 1279)	0.2313238741169983
  (2, 343)	0.2360810401433826
  (2, 197)	0.3637310388376553
  (3, 4492)	0.6266022302533965
  (3, 2145)	0.3021633453220806
  (3, 2049)	0.6419319795891026
  :	:
  (11665, 3723)	0.21943443013098735
  (11665, 3650)	0.2421494886850854
  (11665, 3504)	0.1790559698954363
  (11665, 3447)	0.181170955468126
  (11665, 3359)	0.21030074044075253
  (11665, 2441)	0.17299589800210632
  (

In [15]:
# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf,Train_Y)
# predict the labels on validation dataset
predictions_SVM = SVM.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)

SVM Accuracy Score ->  87.5624875024995


In [16]:
import joblib

In [17]:
joblib.dump(SVM,'fypmodel.sav')

['fypmodel.sav']

In [None]:
def emotionfun(sentence): 
    tokenizedSentence = word_tokenize(sentence)
    Tfidf_vect = TfidfVectorizer(max_features=5000,lowercase=True,stop_words='english')
    Tfidf_vect.fit(tokenizedSentence)
    return SVM.predict(Tfidf_vect)


In [None]:
print(emotionfun("I didnt feel humiliated"))