In [211]:
import pickle
import pandas as pd
import numpy as np
# replace with 20news to generate results for a different dataset
with open('data/20news/coarse/df.pkl', 'rb') as f:
    data = pickle.load(f)

In [212]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
  
lemmatizer = WordNetLemmatizer()
import re
def cleaning(sentence):
    stop_words = set(stopwords.words('english'))
    tokens = re.sub(r'[^\w\s]', '', sentence.lower()).replace("\n", " ").split(" ")
    cleaned = [token for token in tokens if token not in stop_words]
    return " ".join(cleaned)
data["sentence"]=data["sentence"].apply(cleaning)
data

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lorraine/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/lorraine/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/lorraine/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,sentence,label
0,wheres thing subject car nntppostinghost rac3...,rec
1,guy kuo subject si clock poll final call sum...,comp
2,thomas e willis subject pb questions organiza...,comp
3,jgreenamber joe green subject weitek p9000 or...,comp
4,jonathan mcdowell subject shuttle launch ques...,sci
...,...,...
18254,stupendous man subject temperature dark sky o...,sci
18255,jim smyton subject monitors kept 24 hours da...,comp
18256,subject game length braves update organizatio...,rec
18257,subject intel chmos 80868088 design kit news...,misc


In [213]:
def tfidf(word):
    sentence = data['sentence']
    idf = np.log(len(sentence)/sentence.str.contains(word).sum())
    result = []
    for i in range(len(sentence)):
        tf = sentence.iloc[i].count(word)/(len(sentence.iloc[i]))
        result.append(tf*idf)
    return result


In [214]:
import json
f = open('data/20news/coarse/seedwords.json')
seeds = json.load(f)
result = pd.DataFrame()
for key, value in seeds.items():
    df = pd.DataFrame()
    for w in value:
        df[w] = tfidf(w)
    result[key] = df.sum(axis = 1)
result


Unnamed: 0,alt,comp,misc,rec,sci,talk,soc
0,0.0,0.000000,0.000000,0.013655,0.000000,0.0,0.000000
1,0.0,0.000000,0.000000,0.003874,0.000000,0.0,0.000000
2,0.0,0.008086,0.000000,0.000000,0.000000,0.0,0.000000
3,0.0,0.006067,0.000000,0.002201,0.000000,0.0,0.000000
4,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...
18254,0.0,0.000000,0.000000,0.000000,0.004415,0.0,0.000000
18255,0.0,0.003376,0.000000,0.002008,0.000000,0.0,0.000000
18256,0.0,0.000000,0.000000,0.015876,0.000000,0.0,0.000000
18257,0.0,0.000000,0.003737,0.000000,0.000000,0.0,0.000000


In [215]:
data["prediction"] = result.idxmax(1)
data

Unnamed: 0,sentence,label,prediction
0,wheres thing subject car nntppostinghost rac3...,rec,rec
1,guy kuo subject si clock poll final call sum...,comp,rec
2,thomas e willis subject pb questions organiza...,comp,comp
3,jgreenamber joe green subject weitek p9000 or...,comp,comp
4,jonathan mcdowell subject shuttle launch ques...,sci,alt
...,...,...,...
18254,stupendous man subject temperature dark sky o...,sci,sci
18255,jim smyton subject monitors kept 24 hours da...,comp,comp
18256,subject game length braves update organizatio...,rec,rec
18257,subject intel chmos 80868088 design kit news...,misc,misc


In [216]:
# micro and macro F1 using tf-idf
from sklearn import metrics

In [217]:
metrics.f1_score(data["label"], data["prediction"], average="micro")

0.4772988663125034

In [218]:
metrics.f1_score(data["label"], data["prediction"], average="macro")

0.4931901047586327

In [301]:

import gensim
from gensim.test.utils import common_texts
from gensim.models import Word2Vec
def preprocessing(sentence):
    tokens = sentence.split(" ")
    return [token for token in tokens if token!="" and token != " "]
features = data["sentence"].apply(preprocessing)
model = Word2Vec(sentences=features, size=100, window=5, min_count=1, workers=4)
#features

In [302]:
model.save("word2vec.model")
model = Word2Vec.load("word2vec.model")
model.train(features, total_examples=len(data), epochs=20)
#vector = model.wv["atheism"]
#vector

(51031570, 52124060)

In [303]:
def get_vectors_per_label(filename):
    f = open(filename)
    seeds = json.load(f)
    vector_per_label = []
    for key, value in seeds.items():
        lst = []
        for w in value:
            lst.append(model.wv[w])
        arr = np.asarray(lst)
        total = np.average(arr, axis=0)
        vector_per_label.append(total)
    return vector_per_label
vector_per_label = get_vectors_per_label('data/20news/coarse/seedwords.json')

In [304]:
def get_vector_per_doc(feature):
    vector_per_doc = []
    for feat in feature:
        lst = []
        for w in feat:
            lst.append(model.wv[w])
        arr = np.asarray(lst)
        total = np.average(arr, axis=0)
        vector_per_doc.append(total)
    return vector_per_doc
vector_per_doc = get_vector_per_doc(features)

In [305]:
len(vector_per_doc)

18259

In [306]:
f = open('data/20news/coarse/seedwords.json')
seeds = json.load(f)
from numpy.linalg import norm
def predict_word2vec(vector_per_doc, vector_per_label):
    predictions = []
    labels = list(seeds.keys())
    for doc in vector_per_doc:
        cosine = []
        for label in vector_per_label:
            cosine.append(np.dot(doc,label)/(norm(doc)*norm(label)))
        max_value = max(cosine)
        max_index = cosine.index(max_value)
        predictions.append(labels[max_index])
    return predictions   
prediction_word2vec = predict_word2vec(vector_per_doc, vector_per_label)

In [307]:
data["prediction_word2vec"] = prediction_word2vec
data

Unnamed: 0,sentence,label,prediction,prediction_word2vec
0,wheres thing subject car nntppostinghost rac3...,rec,rec,rec
1,guy kuo subject si clock poll final call sum...,comp,rec,comp
2,thomas e willis subject pb questions organiza...,comp,comp,rec
3,jgreenamber joe green subject weitek p9000 or...,comp,comp,comp
4,jonathan mcdowell subject shuttle launch ques...,sci,alt,sci
...,...,...,...,...
18254,stupendous man subject temperature dark sky o...,sci,sci,sci
18255,jim smyton subject monitors kept 24 hours da...,comp,comp,rec
18256,subject game length braves update organizatio...,rec,rec,rec
18257,subject intel chmos 80868088 design kit news...,misc,misc,comp


In [308]:
# micro and macro F1 using word2vec
metrics.f1_score(data["label"], data["prediction_word2vec"], average="micro")

0.6894682074593351

In [309]:
metrics.f1_score(data["label"], data["prediction_word2vec"], average="macro")

0.6264010307366028

In [None]:
sum(data["prediction_word2vec"] == data["label"])/len(data)