In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


### import data

In [2]:
tweet = pd.read_json("tweets_DM.json",lines=True)
data_identification = pd.read_csv("data_identification.csv")
emotion = pd.read_csv("emotion.csv")

### clean data

In [3]:
tweet["_crawldate"] = pd.to_datetime(tweet["_crawldate"])
tweet["id"] = tweet["_source"].apply(lambda x : x["tweet"]["tweet_id"])
tweet["text"] = tweet["_source"].map(lambda x : x["tweet"]["text"])
tweet_clean = pd.merge(tweet,data_identification,how = "left",left_on = "id",right_on = "tweet_id").drop(columns=["tweet_id","_type","_index"])
tweet_clean=pd.merge(tweet_clean,emotion,how="left",left_on = "id",right_on = "tweet_id").drop(columns="tweet_id")
tweet_clean.sort_values(by=["identification","_crawldate"],ascending=[False,True],inplace=True)
tweet_clean

Unnamed: 0,_score,_source,_crawldate,id,text,identification,emotion
1393060,887,"{'tweet': {'hashtags': ['foodporn', 'sohungry'...",2015-01-01 01:00:36,0x25abba,WTF!! I'm so damn hungry right now and all the...,train,disgust
1018667,1018,"{'tweet': {'hashtags': [], 'tweet_id': '0x37be...",2015-01-01 01:01:27,0x37be92,<LH> Birthday Siva sir,train,joy
381192,720,"{'tweet': {'hashtags': ['IdiotInChief', 'Covfe...",2015-01-01 01:02:31,0x21d8cf,@JaydaBF @realDonaldTrump You have just retwee...,train,sadness
17974,755,"{'tweet': {'hashtags': [], 'tweet_id': '0x2de1...",2015-01-01 01:03:36,0x2de153,Watching <LH> at work was definitely a bad ide...,train,fear
116932,927,"{'tweet': {'hashtags': ['askingalexandria', 'm...",2015-01-01 01:04:31,0x33e33b,Can’t wait for the new @AAofficial album to dr...,train,anticipation
...,...,...,...,...,...,...,...
1071053,548,"{'tweet': {'hashtags': [], 'tweet_id': '0x34b8...",2017-12-28 23:38:45,0x34b8c5,I hate to admit it but I laughed out loud to t...,test,
1044368,234,"{'tweet': {'hashtags': [], 'tweet_id': '0x30a9...",2017-12-28 23:43:23,0x30a9a6,@mickerbod I'm privileged to watch this endles...,test,
419510,376,"{'tweet': {'hashtags': [], 'tweet_id': '0x2a66...",2017-12-28 23:49:09,0x2a66c9,i believed! one day you will come into my life...,test,
1759896,21,"{'tweet': {'hashtags': [], 'tweet_id': '0x1f23...",2017-12-28 23:50:47,0x1f232b,"UPDATE: hitched a ride on Blablacar, not only ...",test,


### preprocessing
When we face classification problem, we need to choose a proper model.  
用用看word2vec以取平均值代表句子 丟入KNN模型中看結果  


In [26]:
# 切分句子，以方便放入word2vec
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
import string

stop_words = set(stopwords.words('english'))

temp = tweet_clean["text"].apply(lambda x:word_tokenize(x))
tweet_clean["word_split"] = temp.apply(lambda words: [word for word in words if word not in string.punctuation and word.lower() not in stop_words])
tweet_clean["word_split"]


1393060    [WTF, 'm, damn, hungry, right, food, porn, pic...
1018667                            [LH, Birthday, Siva, sir]
381192     [JaydaBF, realDonaldTrump, retweeted, fakenews...
17974      [Watching, LH, work, definitely, bad, idea, Go...
116932     [’, wait, new, AAofficial, album, drop, Decemb...
                                 ...                        
1071053    [hate, admit, laughed, loud, new, LibDems, PPB...
1044368    [mickerbod, 'm, privileged, watch, endless, cr...
419510         [believed, one, day, come, life, missing, LH]
1759896    [UPDATE, hitched, ride, Blablacar, still, aliv...
572949     [Happiest, Birthday, PandaNgKL_02, Smile, like...
Name: word_split, Length: 1867535, dtype: object

In [27]:
from gensim.models import Word2Vec

## setting
vector_dim = 100        # 含義：詞向量的維度。對每個單詞，模型會生成一個固定長度的向量（如 [0.1, -0.2, 0.3, ...]）。vector_size 決定了這些向量的長度
window_size = 5         # 上下文窗口的大小，指模型在訓練時，考慮每個詞的周圍多少個詞來捕捉語義。
min_count = 1           # 忽略語料中出現頻率低於 min_count 的單詞
training_epochs = 10    # 每次訓練，模型會將語料完整地掃描一遍，這稱為「一個 epoch」。在一次 epoch 中，所有的句子或文檔都被用來更新模型的參數。2個epochs可以理解成模型訓練過一次了，基於訓練後的模型再訓練一次，有點像deep learning


## model
word2vec_model = Word2Vec(sentences=tweet_clean["word_split"], 
                          vector_size=vector_dim, window=window_size, 
                          min_count=min_count, epochs=training_epochs)

In [29]:
word_vec = word2vec_model.wv['happy']  # 從模型中找到單詞的向量，單一詞的向量根據這次的設定有100個
word_vec

array([ 1.0946575 ,  3.3085358 , -5.36146   ,  4.351481  ,  1.0155076 ,
        0.7796804 , -1.3419063 ,  1.7378699 , -0.63494277, -4.038788  ,
       -2.7730186 , -4.110383  , -0.25885323, -1.7599722 ,  0.52316946,
        5.128932  , -0.12225738,  0.7557715 , -0.18295553, -0.5610518 ,
       -3.3123155 ,  0.729856  , -5.879936  , -0.33385688, -3.2566755 ,
        2.2281423 , -1.3858492 , -0.16583024,  1.4787598 , -2.240668  ,
       -0.3491676 , -0.5017754 ,  2.7864811 ,  0.11716035, -0.88663936,
        2.5699415 ,  0.390959  , -3.0171309 , -1.350871  , -1.1744848 ,
       -0.65655744,  2.301452  ,  1.063616  ,  2.5288384 , -1.7253608 ,
        0.15904929,  2.6951861 ,  1.4969612 ,  2.2692904 , -2.1833582 ,
        2.1161141 , -2.3010957 ,  4.3996506 ,  0.5901914 ,  0.34676966,
        1.8320973 , -2.4640915 ,  0.78306043,  1.7089967 ,  0.25929013,
       -0.16138333, -0.6551901 , -0.24498315,  1.3426391 , -3.0515363 ,
       -1.206753  , -0.11686386,  2.4100344 ,  0.1191423 ,  0.18

In [37]:
def wordvec_to_sentence(x):
    word_vec=np.zeros((len(x),100))
    for i,j in enumerate(x):
        word_vec[i,:] = word2vec_model.wv[j]
    t = word_vec.sum(axis=0)
    return t

In [40]:
tweet_clean["sentence_vec"] = tweet_clean["word_split"].apply(lambda x: wordvec_to_sentence(x))
tweet_clean["sentence_vec"]

1393060    [-8.74359582317993, 30.33151352405548, -16.739...
1018667    [2.2336567640304565, -0.34930962324142456, -6....
381192     [-2.173203855752945, 1.2440386563539505, 3.002...
17974      [-9.186324305832386, 17.120205894112587, -21.2...
116932     [-5.844593670219183, 7.938859216868877, 5.8484...
                                 ...                        
1071053    [-6.257771387696266, 11.112639918923378, -9.14...
1044368    [-1.3312410678481683, 9.10064145270735, 7.7146...
419510     [-3.224938243627548, 7.58607292175293, -8.9211...
1759896    [-12.339123457670212, 12.176935940980911, -5.6...
572949     [-9.941006481647491, 9.925282841548324, -17.54...
Name: sentence_vec, Length: 1867535, dtype: object

In [43]:
train_tweet = tweet_clean[tweet_clean["identification"] == "train"]
test_tweet = tweet_clean[tweet_clean["identification"] == "test"]

In [48]:
# Initialize the KNN classifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

X_train = train_tweet["sentence_vec"].to_list()
y_train = train_tweet["emotion"]

knn = KNeighborsClassifier(n_neighbors=10)

knn.fit(X_train, y_train)

In [None]:
from sklearn.metrics import classification_report

x_test = test_tweet["sentence_vec"].to_list()
y_test = test_tweet["emotion"]

y_pred = knn.predict(x_test)
y_pred.to_list


ValueError: Classification metrics can't handle a mix of unknown and multiclass targets

In [61]:
y_pred.tolist()
test_tweet
output = pd.DataFrame({"id":test_tweet["id"],"emotion":y_pred.tolist()})
output

In [68]:
output.reset_index(drop=True).set_index("id").to_csv("output1.csv")
