In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\abram\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\abram\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
pd.set_option('display.max_colwidth', 2)
df = pd.read_csv("Emotion_classify_Data.csv")
df

Unnamed: 0,Comment,Emotion
0,i seriously hate one subject to death but now i feel reluctant to drop it,fear
1,im so full of life i feel appalled,anger
2,i sit here to write i start to dig out my feelings and i think that i am afraid to accept the possibility that he might not make it,fear
3,ive been really angry with r and i feel like an idiot for trusting him in the first place,joy
4,i feel suspicious if there is no one outside like the rapture has happened or something,fear
...,...,...
5932,i begun to feel distressed for you,fear
5933,i left feeling annoyed and angry thinking that i was the center of some stupid joke,anger
5934,i were to ever get married i d have everything ready to offer to him because i ve got it together and when i do go out to clubs even the perfect good looking guys feel intimated after talking to me about my clever self,joy
5935,i feel reluctant in applying there because i want to be able to find a company where i know at least one person,fear


In [3]:
stop_words = nltk.corpus.stopwords.words('english')
additional_stop_words  = ['im', 'iv', 'ive']
stop_words.extend(additional_stop_words)

In [4]:
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')

def text_tokenize(text):
    text_array = tokenizer.tokenize(text) 
    text_array = [i for i in text_array if i not in stop_words and len(i)!=1]
    return text_array

In [5]:
df["Comment"] = df["Comment"].apply(lambda x: text_tokenize(x))

In [6]:
df["Comment"]

0       [seriously, hate, one, subject, death, feel, reluctant, drop]                                                                                       
1       [full, life, feel, appalled]                                                                                                                        
2       [sit, write, start, dig, feelings, think, afraid, accept, possibility, might, make]                                                                 
3       [really, angry, feel, like, idiot, trusting, first, place]                                                                                          
4       [feel, suspicious, one, outside, like, rapture, happened, something]                                                                                
                                        ...                                                                                                                 
5932    [begun, feel, distressed]                         

In [7]:
lemmatizer = nltk.stem.WordNetLemmatizer()

def text_lemmatizer(text_array):
    text_array = [lemmatizer.lemmatize(word) for word in text_array]
    return  text_array

In [8]:
df["Comment"] = df["Comment"].apply(lambda x: text_lemmatizer(x))

In [9]:
df["Comment"]

0       [seriously, hate, one, subject, death, feel, reluctant, drop]                                                                                     
1       [full, life, feel, appalled]                                                                                                                      
2       [sit, write, start, dig, feeling, think, afraid, accept, possibility, might, make]                                                                
3       [really, angry, feel, like, idiot, trusting, first, place]                                                                                        
4       [feel, suspicious, one, outside, like, rapture, happened, something]                                                                              
                                        ...                                                                                                               
5932    [begun, feel, distressed]                                     

In [10]:
def to_string(arr):
    str = ""
    for word in arr:
        str = str + word + " "
    return str

In [11]:
df["Comment"] = df["Comment"].apply(lambda x: to_string(x))

In [12]:
df["Comment"]

0       seriously hate one subject death feel reluctant drop                                                                          
1       full life feel appalled                                                                                                       
2       sit write start dig feeling think afraid accept possibility might make                                                        
3       really angry feel like idiot trusting first place                                                                             
4       feel suspicious one outside like rapture happened something                                                                   
                                    ...                                                                                               
5932    begun feel distressed                                                                                                         
5933    left feeling annoyed angry thinking center stup

In [13]:
type(df["Comment"][0])

str

In [14]:
def numeraize(text):
    vectorizer = CountVectorizer()
    text = [text]
    array = vectorizer.fit_transform(text)
    return array.toarray()

In [15]:
df["Comment"] = df["Comment"].apply(lambda x: numeraize(x))

In [16]:
df["Comment"]

0       [[1, 1, 1, 1, 1, 1, 1, 1]]                                    
1       [[1, 1, 1, 1]]                                                
2       [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]                           
3       [[1, 1, 1, 1, 1, 1, 1, 1]]                                    
4       [[1, 1, 1, 1, 1, 1, 1, 1]]                                    
                   ...                                                
5932    [[1, 1, 1]]                                                   
5933    [[1, 1, 1, 1, 1, 1, 1, 1]]                                    
5934    [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]
5935    [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]                           
5936    [[1, 1, 1, 1, 1, 1]]                                          
Name: Comment, Length: 5937, dtype: object

In [17]:
emotions = df["Emotion"].unique()

In [18]:
def emotion_numeraize(emotion):
    for i in range(len(emotions)):
        if(emotions[i] == emotion):
            emotion =  i
    return emotion

In [19]:
df["Emotion"] = df["Emotion"].apply(lambda x: emotion_numeraize(x))

In [20]:
df["Emotion"]

0       0
1       1
2       0
3       2
4       0
       ..
5932    0
5933    1
5934    2
5935    0
5936    1
Name: Emotion, Length: 5937, dtype: int64

In [21]:
y = df["Emotion"]
x = df["Comment"]
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.2, random_state = 42)

In [22]:
x_train

4945    [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]                                          
5428    [[1, 2, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1]]                                 
1344    [[1, 1, 1, 1, 1, 1, 1]]                                                   
1888    [[1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1]]                                 
2480    [[1, 1, 1, 1]]                                                            
             ...                                                                  
3772    [[1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1]]                     
5191    [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]                                          
5226    [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1]]
5390    [[1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]            
860     [[1, 1, 1, 1]]                                                            
Name: Comment, Length: 4749, dtype: object

In [23]:
model = SVC()
model.fit(x_train, y_train)
print("train is sucess!")

ValueError: setting an array element with a sequence.