In [17]:
import pandas as pd
import numpy as np

### Data

The following cell will download all data required for this assignment into the folder `week1/data`.

In [18]:
df=pd.read_json('dataset.json')
df1 = pd.read_json('datasetresult.json')

In [19]:
df['data']=df['headline']+' '+df['short_description']
df1['data']=df1['headline']+' '+df1['short_description']

In [20]:
Y = df['category']

In [21]:
df.drop('headline',axis = 1,inplace=True)
df.drop('short_description',axis = 1,inplace=True)
df.drop('category',axis=1,inplace=True)

df1.drop('headline',axis = 1,inplace=True)
df1.drop('short_description',axis = 1,inplace=True)



In [22]:
df.head()

Unnamed: 0,id,data
0,1,There Were 2 Mass Shootings In Texas Last Week...
1,2,Will Smith Joins Diplo And Nicky Jam For The 2...
2,3,Hugh Grant Marries For The First Time At Age 5...
3,4,Jim Carrey Blasts 'Castrato' Adam Schiff And D...
4,5,Julianna Margulies Uses Donald Trump Poop Bags...


In [23]:
df1.head()

Unnamed: 0,id,data
0,1,"Elderly, Conservatives Spread The Most Fake Ne..."
1,2,Evangelical Pastor Defends Trump's Border Plan...
2,3,Progressive Groups Blast Centrist Tilt Of Hous...
3,4,Petition Asks Trump To Merge North And South D...
4,5,Dems Introduce Bill To Shield Unpaid Federal W...


### Text preprocessing

For this and most of the following assignments you will need to use a list of stop words. It can be downloaded from *nltk*:

In [10]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [24]:
X_train= df
y_train = Y
X_test = df1


In [25]:
y_train.head()

0            crime
1    entertainment
2    entertainment
3    entertainment
4    entertainment
Name: category, dtype: object

In [26]:
X_test.head()

Unnamed: 0,id,data
0,1,"Elderly, Conservatives Spread The Most Fake Ne..."
1,2,Evangelical Pastor Defends Trump's Border Plan...
2,3,Progressive Groups Blast Centrist Tilt Of Hous...
3,4,Petition Asks Trump To Merge North And South D...
4,5,Dems Introduce Bill To Shield Unpaid Federal W...


In [27]:
X_train.head()

Unnamed: 0,id,data
0,1,There Were 2 Mass Shootings In Texas Last Week...
1,2,Will Smith Joins Diplo And Nicky Jam For The 2...
2,3,Hugh Grant Marries For The First Time At Age 5...
3,4,Jim Carrey Blasts 'Castrato' Adam Schiff And D...
4,5,Julianna Margulies Uses Donald Trump Poop Bags...


In [28]:
import re

In [29]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = list(set(stopwords.words('english')))

def text_prepare(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = text.lower()# lowercase text
    text = text = re.sub(REPLACE_BY_SPACE_RE,' ',text).strip() # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = text = re.sub(BAD_SYMBOLS_RE,'',text).strip()# delete symbols which are in BAD_SYMBOLS_RE from text
    text = text = ' '.join([word for word in text.split() if word not in STOPWORDS]).strip()# delete stopwords from text
    return text.strip()

In [30]:
X_train, y_train, ids,X_test = X_train['data'].values, y_train.values,X_test['id'],X_test['data']

In [31]:
X_train = [text_prepare(x) for x in X_train]
X_test = [text_prepare(x) for x in X_test]

In [34]:
print(X_train[:3])
print(X_test[:3])

['2 mass shootings texas last week 1 tv left husband killed children another day america', 'smith joins diplo nicky jam 2018 world cups official song course song', 'hugh grant marries first time age 57 actor longtime girlfriend anna eberstein tied knot civil ceremony']
['elderly conservatives spread fake news 2016 election new study finds americans shared false information trumps presidential campaign demographics', 'evangelical pastor defends trumps border plan heaven gonna wall robert jeffress cited bible fox friends support presidents plans border wall christians agree', 'progressive groups blast centrist tilt house democrats campaign arm rep cheri bustos leadership team dccc drawn exclusively businessfriendly new democrat coalition']


In [35]:
for i in y_train[:3]:
  print(i)

crime
entertainment
entertainment


In [36]:
# Dictionary of all categories from train corpus with their counts.
# Dictionary of all words from train corpus with their counts.
category_counts = {}
words_counts = {}
for i in range(len(X_train)):
  topic=X_train[i].split()
  for word in topic:
    if word in words_counts.keys():
      words_counts[word]+=1
    else:
      words_counts[word]=0
for category in y_train:
   if category in category_counts.keys():
      category_counts[category]+=1
   else:
      category_counts[category]=0


In [37]:
most_common_category = sorted(category_counts.items(), key=lambda x: x[1], reverse=True)[:3]
most_common_words = sorted(words_counts.items(), key=lambda x: x[1], reverse=True)[:3]

print(most_common_category)
print(most_common_words)

[('politics', 32738), ('wellness', 17826), ('entertainment', 16057)]
[('new', 18697), ('one', 15486), ('us', 14431)]


In [38]:
DICT_SIZE = 15000
WORDS_TO_INDEX={}
INDEX_TO_WORDS={}

most_common = sorted(words_counts.items(), key=lambda x: x[1], reverse=True)[:DICT_SIZE]
for i in range(DICT_SIZE):
  word=most_common[i][0]
  WORDS_TO_INDEX[word]=i
  INDEX_TO_WORDS[i]=word
  
ALL_WORDS = WORDS_TO_INDEX.keys()

def my_bag_of_words(text, words_to_index, dict_size):
    result_vector = np.zeros(dict_size,dtype=int)
    all = words_to_index.keys()
    
    for word in text.split():
      if word in all:
        result_vector[words_to_index[word]]+=1    
      else:
        pass
    return result_vector

In [39]:
from scipy import sparse as sp_sparse

In [40]:
X_train_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_train])
X_test_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_test])
print('X_train shape ', X_train_mybag.shape)
print('X_test shape ', X_test_mybag.shape)

X_train shape  (200853, 15000)
X_test shape  (980, 15000)


In [41]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [43]:
def tfidf_features(X_train,X_test):    
    
    tfidf = TfidfVectorizer(min_df=5, max_df=0.9, ngram_range=(1,2),token_pattern = '(\S+)')
    
    X_train = tfidf.fit_transform(X_train)
    X_test = tfidf.transform(X_test)
       
    return X_train, X_test, tfidf.vocabulary_

In [44]:
X_train_tfidf,X_test_tfidf, tfidf_vocab = tfidf_features(X_train,X_test)


In [45]:
tfidf_reversed_vocab = {i:word for word,i in tfidf_vocab.items()}

In [46]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier

In [47]:
def train_classifier(X_train, y_train):
   
    clsfr=OneVsRestClassifier(LogisticRegression(random_state=7,penalty='l1',C=0.7),n_jobs=-1)
    clsfr.fit(X_train,y_train)
    return clsfr
    

Train the classifiers for different data transformations: *bag-of-words* and *tf-idf*.

In [48]:
classifier_mybag = train_classifier(X_train_mybag, y_train)
classifier_tfidf = train_classifier(X_train_tfidf, y_train)

Now you can create predictions for the data. You will need two types of predictions: labels and scores.

In [49]:
y_test_predicted_labels_mybag = classifier_mybag.predict(X_train_mybag)
y_test_predicted_scores_mybag = classifier_mybag.decision_function(X_train_mybag)
y_test_predicted_labels_tfidf = classifier_tfidf.predict(X_train_tfidf)
y_test_predicted_scores_tfidf = classifier_tfidf.decision_function(X_train_tfidf)

In [50]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [51]:
def print_evaluation_scores(y_val, predicted):
    print('accuracy: ',accuracy_score(y_val,predicted))
    print('f1_score_micro: ',f1_score(y_val,predicted,average = 'micro'))

In [52]:
print('Bag-of-words')
print_evaluation_scores(y_train, y_test_predicted_labels_mybag)
print('Tfidf')
print_evaluation_scores(y_train, y_test_predicted_labels_tfidf)

Bag-of-words
accuracy:  0.7181321663106849
f1_score_micro:  0.7181321663106849
Tfidf
accuracy:  0.632547186250641
f1_score_micro:  0.632547186250641


In [53]:
from sklearn.linear_model import SGDClassifier
from sklearn import svm

In [54]:
clsfr=SGDClassifier()
clsfr_mbg  = SGDClassifier()

In [55]:
clsfr.fit(X_train_tfidf,y_train)
clsfr_mbg.fit(X_train_mybag,y_train)



SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=None,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',
       power_t=0.5, random_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False)

In [56]:
pred=clsfr.predict(X_train_tfidf)
pred2=clsfr_mbg.predict(X_train_mybag)

In [57]:
print_evaluation_scores(y_train, pred)
print_evaluation_scores(y_train,pred2)

accuracy:  0.7178135253145335
f1_score_micro:  0.7178135253145336
accuracy:  0.6848789911029459
f1_score_micro:  0.6848789911029459


In [58]:
out = clsfr.predict(X_test_tfidf)

In [59]:
df2=pd.DataFrame({'id' : ids,
                'category':out })

In [60]:
df2.head()

Unnamed: 0,id,category
0,1,politics
1,2,politics
2,3,politics
3,4,politics
4,5,politics


In [61]:
df2.to_csv('submission.csv',index=False)

In [53]:
df3=pd.read_json('submission.json')

In [54]:
df3.head()

Unnamed: 0,id,category
0,1,crime
1,2,entertainment
10,11,entertainment
100,101,politics
1000,1001,crime
