https://medium.com/@MSalnikov/text-clustering-with-k-means-and-tf-idf-f099bcf95183          
https://www.kaggle.com/nxtasha/bbc-news-classification-natasha/data

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re, string

In [54]:
from sklearn.model_selection import train_test_split
from sklearn.cluster import DBSCAN, KMeans, AgglomerativeClustering
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn import svm

In [3]:
pd.set_option('display.max_colwidth', 100)

##  Load Data

In [4]:
df = pd.read_csv('data/bbc-text.csv')
df.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home theatre systems plasma high-definition tvs and dig...
1,business,worldcom boss left books alone former worldcom boss bernie ebbers who is accused of overseein...
2,sport,tigers wary of farrell gamble leicester say they will not be rushed into making a bid for andy...
3,sport,yeading face newcastle in fa cup premiership side newcastle united face a trip to ryman premier ...
4,entertainment,ocean s twelve raids box office ocean s twelve the crime caper sequel starring george clooney ...


In [5]:
pd.DataFrame(df['category'].value_counts())

Unnamed: 0,category
sport,511
business,510
politics,417
tech,401
entertainment,386


In [6]:
data = df['text'].values
labels = df['category'].values

## Feature Engineering

In [7]:
y = LabelEncoder().fit_transform(labels)

In [8]:
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')
features = tfidf.fit_transform(df.text).toarray()

In [9]:
pd.DataFrame(features).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14405,14406,14407,14408,14409,14410,14411,14412,14413,14414
0,0.0,0.024458,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.032118,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# K-Means

In [13]:
kmeans = KMeans(n_clusters=5).fit(features)

In [14]:
pred = kmeans.predict(features)

In [15]:
pred[:10]

array([3, 0, 1, 1, 2, 4, 4, 1, 1, 2], dtype=int32)

In [16]:
y[:10]

array([4, 0, 3, 3, 1, 2, 2, 3, 3, 1])

In [17]:
for i,x in enumerate(pred):
    if x == 0:
        pred[i] = 0
    elif x == 1:
        pred[i] = 3
    elif x == 2:
        pred[i] = 1
    elif x == 3:
        pred[i] = 4
    elif x == 4:
        pred[i] = 2

### Score

In [18]:
accuracy_score(y, pred)

0.9676404494382023

# Linear Regression

In [37]:
X_train, X_test, y_train, y_test = train_test_split(features, y, test_size=0.2, random_state=42)

In [38]:
reg = LinearRegression().fit(X_train, y_train)

In [43]:
pred = reg.predict(X_test)
pred = np.round(pred).astype(int)

### Score

In [40]:
pred.shape, y_test.shape

((445,), (445,))

In [44]:
reg.score(X_test, y_test)

0.8458790190832539

In [45]:
accuracy_score(pred, y_test)

0.7303370786516854

# Logistic Regression 

In [25]:
clf = LogisticRegression(random_state=42, solver='lbfgs', multi_class='multinomial').fit(X_train, y_train)

### Score

In [26]:
clf.score(X_test, y_test)

0.9707865168539326

In [27]:
pred = clf.predict(X_test)

In [28]:
accuracy_score(pred, y_test)

0.9707865168539326

In [29]:
clf.coef_

array([[-2.54556364e-02,  3.24827163e-01, -3.74273179e-02, ...,
         6.77013234e-02, -6.38578159e-03, -3.01738464e-02],
       [ 2.32499390e-02,  2.96494511e-01, -2.88844151e-02, ...,
        -9.37563136e-02, -4.08868599e-02, -3.07668598e-02],
       [ 7.63660199e-03,  7.71530210e-02,  7.06789307e-04, ...,
         8.12237031e-03, -3.65419996e-02, -2.75537483e-02],
       [ 1.29106057e-02, -7.94143430e-01, -2.27488575e-02, ...,
         2.46180162e-02,  9.32923995e-02,  1.12246733e-01],
       [-1.83415103e-02,  9.56687349e-02,  8.83538012e-02, ...,
        -6.68539629e-03, -9.47775836e-03, -2.37522785e-02]])

In [30]:
clf.intercept_

array([ 0.14371304,  0.13759294, -0.24404525,  0.24268287, -0.2799436 ])

# SVM

In [55]:
clf = svm.LinearSVC().fit(X_train, y_train)

In [56]:
clf.score(X_test, y_test)

0.9707865168539326

In [57]:
pred = clf.predict(X_test)

In [58]:
accuracy_score(pred, y_test)

0.9707865168539326

In [60]:
clf = svm.SVC(kernel='linear').fit(X_train, y_train)

In [66]:
svm.SVC(kernel='linear').fit(X_train, y_train).score(X_test, y_test)

0.9775280898876404

In [65]:
svm.SVC(kernel='rbf').fit(X_train, y_train).score(X_test, y_test)



0.2202247191011236

In [68]:
svm.SVC(kernel='sigmoid').fit(X_train, y_train).score(X_test, y_test)



0.2202247191011236

In [67]:
svm.SVC(kernel='poly').fit(X_train, y_train).score(X_test, y_test)



0.2202247191011236

In [61]:
clf.score(X_test, y_test)

0.9775280898876404

In [57]:
pred = clf.predict(X_test)

In [58]:
accuracy_score(pred, y_test)

0.9707865168539326