# NLP Basic Assignment
## 과제 : spam.csv를 활용하여 유의미한 해석을 도출해주세요!

In [1]:
import pandas as pd

## Load Data
- 보시면 아시다시피 spam.csv는 라벨이 있는 데이터입니다.
- 7주차 주제가 텍스트 기초인만큼 텍스트만 활용하셔도 되고 라벨까지 활용하셔서 모델을 돌려보셔도 좋습니다.

In [35]:
spam = pd.read_csv('spam.csv')

In [36]:
spam.iloc[5]['v2']

"FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, å£1.50 to rcv"

In [37]:
spam

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


## Tokenizing


In [38]:
import nltk

In [39]:
# 예시 코드

from nltk.tokenize import word_tokenize

nltk.download('punkt')
word_tokenize(spam.iloc[5]['v2'])

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mihye\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


['FreeMsg',
 'Hey',
 'there',
 'darling',
 'it',
 "'s",
 'been',
 '3',
 'week',
 "'s",
 'now',
 'and',
 'no',
 'word',
 'back',
 '!',
 'I',
 "'d",
 'like',
 'some',
 'fun',
 'you',
 'up',
 'for',
 'it',
 'still',
 '?',
 'Tb',
 'ok',
 '!',
 'XxX',
 'std',
 'chgs',
 'to',
 'send',
 ',',
 'å£1.50',
 'to',
 'rcv']

In [40]:
# 토큰화

token = []

for sentence in spam.v2 : 
    words = word_tokenize(sentence) 
    token.append(words)
    
spam.v2 = token

In [41]:
spam

Unnamed: 0,v1,v2
0,ham,"[Go, until, jurong, point, ,, crazy, .., Avail..."
1,ham,"[Ok, lar, ..., Joking, wif, u, oni, ...]"
2,spam,"[Free, entry, in, 2, a, wkly, comp, to, win, F..."
3,ham,"[U, dun, say, so, early, hor, ..., U, c, alrea..."
4,ham,"[Nah, I, do, n't, think, he, goes, to, usf, ,,..."
...,...,...
5567,spam,"[This, is, the, 2nd, time, we, have, tried, 2,..."
5568,ham,"[Will, Ì_, b, going, to, esplanade, fr, home, ?]"
5569,ham,"[Pity, ,, *, was, in, mood, for, that, ., So, ..."
5570,ham,"[The, guy, did, some, bitching, but, I, acted,..."


In [42]:
# 불용어 제거

from nltk.corpus import stopwords

stop_words = set(stopwords.words('english')) 
        
token = []

for words in spam.v2 : 
    words = [word for word in words if word not in stop_words]
    token.append(words)
    
spam.v2 = token

In [43]:
spam

Unnamed: 0,v1,v2
0,ham,"[Go, jurong, point, ,, crazy, .., Available, b..."
1,ham,"[Ok, lar, ..., Joking, wif, u, oni, ...]"
2,spam,"[Free, entry, 2, wkly, comp, win, FA, Cup, fin..."
3,ham,"[U, dun, say, early, hor, ..., U, c, already, ..."
4,ham,"[Nah, I, n't, think, goes, usf, ,, lives, arou..."
...,...,...
5567,spam,"[This, 2nd, time, tried, 2, contact, u., U, å£..."
5568,ham,"[Will, Ì_, b, going, esplanade, fr, home, ?]"
5569,ham,"[Pity, ,, *, mood, ., So, ..., suggestions, ?]"
5570,ham,"[The, guy, bitching, I, acted, like, 'd, inter..."


## Embedding

- 수업에서 다룬 임베딩 방법에는 One-hot encoding, CBOW, Skip-gram 등이 있었습니다. 다양한 시도와 '비교' 결과를 함께 적어주세요! 파라미터를 조정해가는 과정도 해석에 도움이 될 수 있겠죠 :)

In [98]:
spam.v1 = spam.v1.replace(['ham','spam'],[0,1])

In [99]:
corpus_ham = sum(spam['v2'][spam.v1 == 0],[])
corpus_spam = sum(spam['v2'][spam.v1 == 1],[])

### ONE-HOT ENCODING

In [100]:
from sklearn.feature_extraction.text import CountVectorizer

model1=CountVectorizer()

model1_ham=model1.fit_transform(corpus_ham)

count_ham = pd.DataFrame( {'word' : model1.get_feature_names_out(),
                           'count' : model1_ham.sum(axis = 0).flat})
count_ham.sort_values('count', ascending = False).head(10) 


Unnamed: 0,word,count
2586,gt,318
3547,lt,316
2451,get,305
4153,ok,287
3466,ll,262
2491,go,249
2528,got,244
6774,you,244
6308,ur,241
993,call,236


In [101]:
model1_spam=model1.fit_transform(corpus_spam)

count_spam = pd.DataFrame( {'word' : model1.get_feature_names_out(),
                           'count' : model1_spam.sum(axis = 0).flat})
count_spam.sort_values('count', ascending = False).head(10) 

Unnamed: 0,word,count
957,call,355
1358,free,224
2594,txt,163
2639,ur,144
1804,mobile,127
2493,text,125
2402,stop,121
1034,claim,113
2817,you,108
2184,reply,104


### CBOW

In [115]:
from gensim.models.word2vec import Word2Vec

#학습
model2_ham=Word2Vec(spam['v2'][spam.v1 == 0], sg=0)
#모델 저장
model2_ham.save('model2_ham')
#모델 불러오기
model2_ham = Word2Vec.load('model2_ham')


In [116]:
model2_spam=Word2Vec(spam['v2'][spam.v1 == 1], sg=0)
model2_spam.save('model2_spam')
model2_spam = Word2Vec.load('model2_spam')


### SKIP-GRAM

In [123]:
model3_ham=Word2Vec(spam['v2'][spam.v1 == 0], sg=1)
model3_ham.save('model3_ham')
model3_ham = Word2Vec.load('model3_ham')


In [118]:
model3_spam=Word2Vec(spam['v2'][spam.v1 == 1], sg=1)
model3_spam.save('model3_spam')
model3_spam = Word2Vec.load('model3_spam')


## 본인이 도출해낸 해석을 적어주세요!

- 유사도, Wordcloud, 이진 분류 모델, Plot 뭐든 상관없으니 분명하고 인상적인 해석을 적어주시면 됩니다.

### ham에 대한 유사도

In [124]:
print('cbow')
pd.DataFrame(model2_ham.wv.most_similar('free'))

cbow


Unnamed: 0,0,1
0,If,0.999436
1,Can,0.999415
2,said,0.999406
3,got,0.999373
4,told,0.999369
5,ok,0.999366
6,find,0.999358
7,Will,0.999357
8,take,0.999353
9,cos,0.999351


In [125]:
print('skip-gram')
pd.DataFrame(model3_ham.wv.most_similar('free'))

skip-gram


Unnamed: 0,0,1
0,Can,0.987023
1,ill,0.986787
2,wont,0.985125
3,Call,0.984471
4,check,0.983483
5,At,0.982773
6,bring,0.982681
7,Carlos,0.982641
8,calls,0.982393
9,reach,0.982167


### spam에 대한 유사도

In [126]:
print('cbow')
pd.DataFrame(model2_spam.wv.most_similar('call'))

cbow


Unnamed: 0,0,1
0,!,0.999755
1,",",0.999747
2,.,0.999742
3,?,0.99972
4,2,0.999681
5,ur,0.999666
6,Call,0.999655
7,-,0.999641
8,mobile,0.999629
9,FREE,0.999623


In [127]:
print('skip-gram')
pd.DataFrame(model3_spam.wv.most_similar('call'))

skip-gram


Unnamed: 0,0,1
0,please,0.997222
1,easy,0.996716
2,08707509020,0.996678
3,matches,0.996392
4,BT-national-rate,0.996278
5,match,0.996146
6,delivery,0.996074
7,Delivery,0.995999
8,LIVE,0.995986
9,yr,0.995972
