In [87]:
!pip install TurkishStemmer gensim unicode_tr



In [0]:
import numpy as np
import pandas as pd
import re
import nltk
import string
from nltk.corpus import stopwords
from TurkishStemmer import TurkishStemmer
from unicode_tr import unicode_tr

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_curve, auc, f1_score, roc_auc_score

import warnings; warnings.simplefilter('ignore')

In [89]:
download_list = ["stopwords","wordnet"]
for download in download_list:
    nltk.download(download)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [90]:
data = pd.read_csv("https://raw.githubusercontent.com/naynco/nayn.data/master/classification_clean.csv")
main_categories = ['DÜNYA', 'SPOR','SANAT','Teknoloji']
filter = data["Categories"].isin(main_categories)
data = data[filter]
train_data = data[['Title', 'Categories']]
train_data.head()

Unnamed: 0,Title,Categories
12006,58 Saniyede Katar Meselesi? Katar krizi nedir?...,DÜNYA
12496,58 Saniyede Türkiye - Almanya Gerginliği,DÜNYA
12877,"Adriana Lima, Bomba Aşkla İlgili İlk Kez Konuş...",DÜNYA
12878,Galatasaraylı Taraftarlar Patladı: İstifa Edin,SPOR
12880,"Galatasaray'dan Ayrılan Sabri, Neredeyse Bedav...",SPOR


In [0]:
def process_content(doc):
    stemmer = TurkishStemmer()
    
    doc = unicode_tr(doc).lower()
    filter_punch = str.maketrans('', '', string.punctuation)
    stripped = doc.translate(filter_punch)

    clean_text = []
    for i in stripped.split():
        if i not in stopwords.words('turkish'):
            clean_text.append(stemmer.stem(i))

    return ' '.join(clean_text)

In [0]:
train_data['processed_title'] = train_data['Title'].apply(process_content)

In [93]:
train_data.head()

Unnamed: 0,Title,Categories,processed_title
12006,58 Saniyede Katar Meselesi? Katar krizi nedir?...,DÜNYA,58 saniye katar mesele katar kriz ne video
12496,58 Saniyede Türkiye - Almanya Gerginliği,DÜNYA,58 saniye türki almanya gerginlik
12877,"Adriana Lima, Bomba Aşkla İlgili İlk Kez Konuş...",DÜNYA,adrian lima bomp aşkl ilgi ilk konuş 35 yıl me...
12878,Galatasaraylı Taraftarlar Patladı: İstifa Edin,SPOR,galatasaray taraftar patl istifa et
12880,"Galatasaray'dan Ayrılan Sabri, Neredeyse Bedav...",SPOR,galatasaray ayrılan sabri nere bedava kayseris...


In [94]:
categories = train_data['Categories']
titles = train_data['processed_title']
N = len(titles)
print('Number of news',N)

Number of news 11622


In [95]:
labels = list(set(categories))
n_classes = len(labels)
print('possible categories',labels)

possible categories ['Teknoloji', 'SPOR', 'DÜNYA', 'SANAT']


In [96]:
for l in labels:
    print('number of ',l,' news',len(train_data.loc[train_data['Categories'] == l]))

number of  Teknoloji  news 144
number of  SPOR  news 1967
number of  DÜNYA  news 9226
number of  SANAT  news 285


In [0]:
X_train, X_test, y_train, y_test = train_test_split(train_data['processed_title'],train_data['Categories'],test_size=0.2,random_state=57)

In [0]:
model = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', LogisticRegression()),
                     ])

In [0]:
text_clf = model.fit(X_train, y_train)

In [0]:
predicted = model.predict(X_test)

In [101]:
confusion_matrix(y_test,predicted)

array([[1835,    2,    5,    0],
       [  53,    3,    0,    0],
       [ 136,    0,  263,    0],
       [  28,    0,    0,    0]])

In [102]:
print('accuracy_score',accuracy_score(y_test,predicted))
print('Reporting...')

accuracy_score 0.9036559139784947
Reporting...


In [103]:
print(classification_report(y_test, predicted, target_names=labels))

              precision    recall  f1-score   support

   Teknoloji       0.89      1.00      0.94      1842
        SPOR       0.60      0.05      0.10        56
       DÜNYA       0.98      0.66      0.79       399
       SANAT       0.00      0.00      0.00        28

    accuracy                           0.90      2325
   macro avg       0.62      0.43      0.46      2325
weighted avg       0.89      0.90      0.88      2325



In [104]:
cross_val_score(model, X_train, y_train, cv=5)

array([0.89199355, 0.90107527, 0.89892473, 0.89940828, 0.90091546])

In [105]:
cross_val_score(model, X_test, y_test, cv=5)

array([0.85867238, 0.86051502, 0.86236559, 0.84267241, 0.85529158])

In [0]:
def predict_title(model, new_data):
    test_data = pd.DataFrame(new_data, columns=['Title'])
    test_data['processed_title'] = test_data['Title'].apply(process_content)
    
    X_test = test_data['processed_title']
    predictions = model.predict(X_test)
    
    return predictions

In [116]:
t1 = ['Fenerbahçe, Neustadter Transferini Borsaya Bildirdi']
news_title = pd.DataFrame(t1, columns=['Title'])
predict_title(model, t1)

array(['SPOR'], dtype=object)