In [231]:
import pandas as pd  
import numpy as np 
import nltk
import sklearn
import math
from collections import Counter
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split


from keras.preprocessing.text import text_to_word_sequence
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from nltk.corpus import stopwords

# Fetch the NLTK resources the preprocessing step relies on (already-installed
# packages are reported as up-to-date rather than downloaded again).
nltk.download('stopwords')  # English stop-word list read via stopwords.words('english')
nltk.download('punkt')  # tokenizer models needed by nltk.word_tokenize
nltk.download('averaged_perceptron_tagger')  # tagger model needed by nltk.pos_tag
nltk.download('wordnet')  # WordNet data needed by WordNetLemmatizer



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Bojana\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Bojana\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Bojana\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Bojana\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [232]:
# Load the raw headlines dataset and preview its first rows.
csv_path = 'podaci.csv'
df = pd.read_csv(csv_path)
df.head()


Unnamed: 0,publish_date,headline_category,headline_text
0,20010102,unknown,Status quo will not be disturbed at Ayodhya; s...
1,20010102,unknown,Fissures in Hurriyat over Pak visit
2,20010102,unknown,America's unwanted heading for India?
3,20010102,unknown,For bigwigs; it is destination Goa
4,20010102,unknown,Extra buses to clear tourist traffic


In [233]:
# Drop the publish_date column (not needed here) together with every row whose
# headline_category is the placeholder value 'unknown' or 'removed'.
#
# `columns=` replaces the positional axis argument: since pandas 2.0 every
# argument of DataFrame.drop after `labels` is keyword-only, so
# `df.drop('publish_date', 1)` raises a TypeError on current pandas.
df = df.drop(columns='publish_date')
is_labelled = ~df['headline_category'].isin(['unknown', 'removed'])
# reset_index(drop=True) renumbers the surviving rows directly, instead of
# materialising the old index as an 'index' column and deleting it afterwards.
df = df[is_labelled].reset_index(drop=True)
df.head()

Unnamed: 0,headline_category,headline_text
0,entertainment.hindi.bollywood,Raju Chacha
1,entertainment.hindi.bollywood,'Devdas': Jinxed?
2,india,Dudhwa tiger died of starvation; not poisoning
3,city.bengaluru,Three in race for chief secy's post
4,city.patna,Druggists' stir leads to shortage of medicines


In [234]:
# Derive a coarse 'category' column from headline_category:
#   * values containing the word 'sports'   -> 'sports'
#   * values containing the word 'politics' -> 'politics'
#   * everything else                       -> 'others'
# np.select uses the first condition that matches, so a value containing both
# words is labelled 'sports'.
#
# na=False makes rows with a missing headline_category evaluate to False (and
# therefore fall into 'others'); without it str.contains yields NaN for those
# rows and the resulting object-dtype condition is rejected by np.select.
# regex=False treats the search terms as plain substrings.
conditions = [
    df['headline_category'].str.contains('sports', regex=False, na=False),
    df['headline_category'].str.contains('politics', regex=False, na=False),
]
choices = ['sports', 'politics']
df['category'] = np.select(conditions, choices, default='others')

In [235]:
# With the new 'category' column in place, headline_category is no longer needed.
# `columns=` is used because pandas >= 2.0 no longer accepts the axis as a
# positional argument (`df.drop('headline_category', 1)` raises a TypeError there).
df = df.drop(columns='headline_category')

In [236]:
# 'category' is the target column; list its distinct values as a sanity check
# that the labels are the expected ones.
target_category = pd.unique(df['category'])
target_category

array(['others', 'sports', 'politics'], dtype=object)

In [237]:
# String labels are inconvenient to work with, so add an integer categoryId
# column. Codes follow order of first appearance in the frame, which here gives
# others = 0, sports = 1, politics = 2.
category_codes = pd.factorize(df['category'])[0]
df['categoryId'] = category_codes
df.head()

Unnamed: 0,headline_text,category,categoryId
0,Raju Chacha,others,0
1,'Devdas': Jinxed?,others,0
2,Dudhwa tiger died of starvation; not poisoning,others,0
3,Three in race for chief secy's post,others,0
4,Druggists' stir leads to shortage of medicines,others,0


In [238]:
# Lookup table pairing each category name with its integer id, ordered by id.
# reset_index(drop=True) replaces `.reset_index().drop('index', 1)`: the
# positional axis argument is rejected by pandas >= 2.0, and dropping the old
# index directly avoids creating the throwaway 'index' column at all.
category = (
    df[["category", "categoryId"]]
    .drop_duplicates()
    .sort_values('categoryId')
    .reset_index(drop=True)
)
category

Unnamed: 0,category,categoryId
0,others,0
1,sports,1
2,politics,2


In [239]:
# Text clean-up applied to every headline before it is vectorised.

# These objects do not depend on the headline being processed. preprocess_df is
# applied once per row, so they are built a single time when this cell runs
# rather than on every call (re-reading the stop-word corpus and re-creating the
# stemmer/lemmatizer per headline is loop-invariant work).
_STOP_WORDS = frozenset(stopwords.words('english'))
_STEMMER = PorterStemmer()
_LEMMATIZER = WordNetLemmatizer()
# Characters stripped during tokenisation. The apostrophe is intentionally not
# listed, so quote characters survive this step.
_PUNCTUATION_FILTER = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
# First letter of the POS tag returned by nltk.pos_tag -> WordNet POS constant.
_TAG_TO_WORDNET_POS = {
    "J": wordnet.ADJ,
    "N": wordnet.NOUN,
    "V": wordnet.VERB,
    "R": wordnet.ADV,
}


def _get_wordnet_pos(word):
    """Return the WordNet POS constant for a single word, defaulting to NOUN."""
    # The word is tagged on its own (no sentence context); only the first letter
    # of the resulting tag is used for the lookup.
    tag = nltk.pos_tag([word])[0][1][0].upper()
    return _TAG_TO_WORDNET_POS.get(tag, wordnet.NOUN)


def preprocess_df(train_text):
    """Normalise one headline into a space-separated string of processed tokens.

    Steps, in order:
      1. Cast the input to ``str``, lowercase it and split it into words,
         stripping the punctuation listed in ``_PUNCTUATION_FILTER``.
      2. Remove English stop words (the, in, on, at, ...), which are frequent
         but add little meaning.
      3. Re-join the words and delete every digit character.
      4. Reduce each token to its stem with the Porter stemmer.
      5. Lemmatize each stem with WordNet, using the POS guessed for that token.

    Parameters
    ----------
    train_text : object
        The headline. Non-string values are converted with ``str()`` first, so
        e.g. a missing value is processed as the literal text of its ``str()``.

    Returns
    -------
    str
        The processed tokens joined by single spaces.
    """
    # 1. Tokenise into lowercase words, ignoring punctuation.
    train_text = str(train_text)
    tokens = text_to_word_sequence(
        train_text, filters=_PUNCTUATION_FILTER, lower=True, split=" "
    )

    # 2. Drop stop words, then 3. re-join and strip digits character by character.
    kept_words = ' '.join(word for word in tokens if word not in _STOP_WORDS)
    without_digits = ''.join(ch for ch in kept_words if not ch.isdigit())

    # 4. Stem every token.
    stem_text = ' '.join(
        _STEMMER.stem(word) for word in nltk.word_tokenize(without_digits)
    )

    # 5. Lemmatize the stems so related forms are treated as one term.
    lem_text = ' '.join(
        _LEMMATIZER.lemmatize(word, _get_wordnet_pos(word))
        for word in nltk.word_tokenize(stem_text)
    )

    return lem_text

In [240]:
# The frame is very large, so only part of it is used: 1000 rows each from
# 'others' and 'sports', while all 1620 'politics' rows are kept.
df['category'].value_counts()

others      3077517
sports       130535
politics       1620
Name: category, dtype: int64

In [241]:
# Pull the 'others' and 'sports' rows out into their own frames and remove them
# from df in place, which leaves df holding only the 'politics' rows.
# NOTE: this cell is not idempotent - running it a second time finds no
# 'others'/'sports' rows left in df and produces empty frames.
others = df.loc[df['category'] == 'others']
df.drop(index=others.index, inplace=True)
sports = df.loc[df['category'] == 'sports']
df.drop(index=sports.index, inplace=True)

In [242]:
# Working set: every remaining row of df plus the first 1000 'others' rows and
# the first 1000 'sports' rows, re-indexed from 0.
subset_frames = [df, others.head(1000), sports.head(1000)]
df_final = pd.concat(subset_frames, ignore_index=True)
df_final

Unnamed: 0,headline_text,category,categoryId
0,Acche din kidhar hain; asks PIL against PM and...,politics,2
1,Call to widen 'health tax' net,politics,2
2,Hike VAT on cigarettes: Harsh Vardhan,politics,2
3,Maharashtra onion politics may bring tears to ...,politics,2
4,Budget 2014: Opposition flags price rise to di...,politics,2
...,...,...,...
3615,Aston Villa see future business in India,sports,1
3616,Benitez praises Liverpool spirit,sports,1
3617,We are in heaven now; says Dudek,sports,1
3618,Safin; Sharapova advance,sports,1


In [243]:
# Normalise every headline, then keep the features and labels under short names.
df_final['headline_text'] = df_final['headline_text'].map(preprocess_df)
text, category = df_final['headline_text'], df_final['category']
text.head()

0                  acch din kidhar hain ask pil pm bjp
1                         call widen 'health tax ' net
2                      hike vat cigarett harsh vardhan
3    maharashtra onion polit may bring tear modi go...
4    budget opposit flag price rise disrupt day par...
Name: headline_text, dtype: object

In [244]:
# Split into train and test sets (80/20, stratified on the label). KNN, Naive
# Bayes and Logistic Regression are then fitted and their accuracy is computed.
X_train, X_test, Y_train, Y_test = train_test_split(
    text,
    category,
    test_size=0.2,
    random_state=60,
    shuffle=True,
    stratify=category,
)

In [245]:
# TF-IDF features fed into a 3-nearest-neighbours classifier.
knn = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', KNeighborsClassifier(n_neighbors=3)),
])
knn.fit(X_train, Y_train)

# Evaluate on the held-out split.
test_predict = knn.predict(X_test)
test_accuracy = accuracy_score(test_predict, Y_test)

print("KNN Test Accuracy Score  : {} ".format(test_accuracy))
print()


KNN Test Accuracy Score  : 0.787292817679558 



In [246]:
# TF-IDF features fed into a multinomial Naive Bayes classifier.
bayes = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', MultinomialNB()),
])
bayes.fit(X_train, Y_train)

# Evaluate on the held-out split.
test_predict = bayes.predict(X_test)
test_accuracy = accuracy_score(test_predict, Y_test)

print("Naive Bayes Test Accuracy Score  : {} ".format(test_accuracy))
print()


Naive Bayes Test Accuracy Score  : 0.7886740331491713 



In [247]:
# TF-IDF features fed into a logistic-regression classifier (fixed random_state).
regression = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression(random_state=0)),
])
regression.fit(X_train, Y_train)

# Evaluate on the held-out split.
test_predict = regression.predict(X_test)
test_accuracy = accuracy_score(test_predict, Y_test)

print("Logistic Regression Test Accuracy Score  : {} ".format(test_accuracy))
print()


Logistic Regression Test Accuracy Score  : 0.8798342541436464 



In [248]:
# Logistic Regression gives the best accuracy, so it is the one being tuned.
# Searching over more parameters ran out of memory, so only two parameters are
# varied here; the remaining candidates are left commented out.
grid_params = {
    #'tfidf__lowercase': [True, False],
    #'tfidf__binary': [True, False],
    #'tfidf__max_features': [None, 100000, 10000],
    'tfidf__norm': ['l2', 'l1'],
    # NOTE(review): `stopwords` below is the imported nltk.corpus object, not a
    # list of words - adjust before re-enabling this entry.
    #'tfidf__stop_words': [None, stopwords],
    #'clf__C': [1.0, 0.1, 0.01],
    #'clf__fit_intercept': [True, False],
    #'clf__penalty': ['l2', 'l1'],
    'clf__solver': ['lbfgs','saga']
}

# Cross-validated search over the grid, scored by accuracy.
grid_search = GridSearchCV(
    estimator=regression,
    param_grid=grid_params,
    scoring='accuracy',
)
grid_search.fit(X_train, Y_train)

# Tabulate the per-combination cross-validation results.
score = pd.DataFrame(grid_search.cv_results_)
score

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__solver,param_tfidf__norm,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.47226,0.072944,0.02081,0.006403,lbfgs,l2,"{'clf__solver': 'lbfgs', 'tfidf__norm': 'l2'}",0.856897,0.873921,0.851468,0.841105,0.849741,0.854626,0.0109,1
1,0.28002,0.014016,0.014644,0.003765,lbfgs,l1,"{'clf__solver': 'lbfgs', 'tfidf__norm': 'l1'}",0.734483,0.740933,0.708117,0.715026,0.71848,0.723408,0.012313,4
2,0.108308,0.022439,0.013168,0.004148,saga,l2,"{'clf__solver': 'saga', 'tfidf__norm': 'l2'}",0.856897,0.873921,0.851468,0.841105,0.849741,0.854626,0.0109,1
3,0.093445,0.010912,0.006044,0.005826,saga,l1,"{'clf__solver': 'saga', 'tfidf__norm': 'l1'}",0.734483,0.74266,0.708117,0.715026,0.720207,0.724099,0.01269,3
