In [1]:
from nltk.corpus import stopwords
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier

import re
import pandas as pd
import numpy as np
import nltk
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/mahmoud/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Load the dataset; the cells below read its 'text' and 'category' columns.
df = pd.read_csv('bbc-text.csv')

In [4]:
# Preview the first rows to check that the columns loaded as expected.
df.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [5]:
# Pull the document texts and their category labels out of the frame as arrays.
# Both still cover the full dataset here; the validation split happens later.
X_train = df['text'].values
y_train = df['category'].values

In [6]:
# Integer id for each category, plus the reverse lookup. decode_y is keyed by
# the *string* form of the id, so callers look labels up via decode_y[str(id)].
encode_y = {
    'business': 0,
    'entertainment': 1,
    'politics': 2,
    'sport': 3,
    'tech': 4,
}
decode_y = {str(class_id): category for category, class_id in encode_y.items()}

# Replace every category name in y_train with its integer id.
y_train = [encode_y[label] for label in y_train]

In [7]:
# Raw strings keep the regex escapes (\[ \] \|) out of Python's own string
# escape processing, which would otherwise warn about invalid escape sequences.
# Punctuation matched here is replaced by a single space in text_prepare.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Any character other than a digit, a lowercase ASCII letter, space, '#', '+'
# or '_' is matched here and removed in text_prepare.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
# NLTK's English stopword list, stored as a set for fast membership tests.
STOPWORDS = set(stopwords.words('english'))

def text_prepare(text):
    """
        Normalize one document before vectorization.

        text: a string

        return: the lowercased text in which characters matched by
                REPLACE_BY_SPACE_RE became spaces, characters matched by
                BAD_SYMBOLS_RE were removed and tokens found in STOPWORDS
                were dropped; the remaining tokens are joined by single
                spaces with no leading or trailing whitespace
    """
    text = text.lower()
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    text = BAD_SYMBOLS_RE.sub('', text)
    # join() places the separator only between kept tokens, so the result never
    # ends with a stray space, even when the last token of the text is a stopword.
    return ' '.join(token for token in text.split() if token not in STOPWORDS)

In [8]:
# Clean every document, then hold out the first 10% of the samples (in file
# order, no shuffling) as the validation set and keep the rest for training.
X_train = [text_prepare(x) for x in X_train]
dec = int(0.1 * len(X_train))
X_val, y_val = X_train[:dec], y_train[:dec]
X_train, y_train = X_train[dec:], y_train[dec:]

In [9]:
def tfidf_features(X_train, X_val):
    """
        Build TF-IDF features for the training and validation samples.

        X_train, X_val — iterables of (already preprocessed) text samples

        return: (X_train, X_val, vocabulary) — the TF-IDF representation of
                each set and the fitted vectorizer's ``vocabulary_`` mapping
    """
    # Raw string so that \S reaches the regex engine instead of being treated
    # as an (invalid) Python string escape. A token is any run of
    # non-whitespace, which keeps symbols such as '+' and '#' inside tokens.
    # Unigrams and bigrams are used; min_df / max_df bound the document
    # frequency of the terms that are kept.
    tfidf_vectorizer = TfidfVectorizer(
        token_pattern=r'(\S+)',
        min_df=5,
        max_df=0.9,
        ngram_range=(1, 2),
    )

    # The vectorizer is fitted on the training set only; the validation set is
    # merely transformed with what was learned from the training data.
    X_train = tfidf_vectorizer.fit_transform(X_train)
    X_val = tfidf_vectorizer.transform(X_val)

    return X_train, X_val, tfidf_vectorizer.vocabulary_

In [10]:
# Vectorize both splits and keep an index -> term lookup alongside the
# term -> index vocabulary returned by the vectorizer.
X_train_tfidf, X_val_tfidf, tfidf_vocab = tfidf_features(X_train, X_val)
tfidf_reversed_vocab = dict(zip(tfidf_vocab.values(), tfidf_vocab.keys()))

# Both matrices must have the same number of feature columns.
n_train_features = X_train_tfidf.shape[1]
n_val_features = X_val_tfidf.shape[1]
print(n_train_features, n_val_features)

14266 14266


In [11]:
def train_classifier(X_train, y_train):
    """
      Fit a one-vs-rest logistic regression model.

      X_train — training features
      y_train — training labels

      return: the fitted OneVsRestClassifier
    """
    # L2-penalised logistic regression (C=1.0) wrapped into OneVsRestClassifier.
    base_estimator = LogisticRegression(penalty='l2', C=1.0)
    one_vs_rest = OneVsRestClassifier(base_estimator)
    one_vs_rest.fit(X_train, y_train)
    return one_vs_rest


In [12]:
# Train on the TF-IDF training features, then evaluate on the validation split.
classifier_tfidf = train_classifier(X_train_tfidf, y_train)
# Predicted class ids for each validation document.
y_val_predicted_labels_tfidf = classifier_tfidf.predict(X_val_tfidf)
# Output of the classifier's decision_function for the same documents.
y_val_predicted_scores_tfidf = classifier_tfidf.decision_function(X_val_tfidf)

In [15]:
# Show the first three validation documents with their true and predicted
# category names (ids are mapped back through decode_y, which has string keys).
report_template = 'Title:\t{}\nTrue labels:\t{}\nPredicted labels:\t{}\n\n'
for i in range(3):
    true_label = decode_y[str(y_val[i])]
    predicted_label = decode_y[str(y_val_predicted_labels_tfidf[i])]
    print(report_template.format(X_val[i], true_label, predicted_label))

Title:	tv future hands viewers home theatre systems plasma highdefinition tvs digital video recorders moving living room way people watch tv radically different five years time according expert panel gathered annual consumer electronics show las vegas discuss new technologies impact one favourite pastimes us leading trend programmes content delivered viewers via home networks cable satellite telecoms companies broadband service providers front rooms portable devices one talkedabout technologies ces digital personal video recorders dvr pvr settop boxes like us tivo uk sky+ system allow people record store play pause forward wind tv programmes want essentially technology allows much personalised tv also builtin highdefinition tv sets big business japan us slower take europe lack highdefinition programming people forward wind adverts also forget abiding network channel schedules putting together alacarte entertainment us networks cable satellite companies worried means terms advertising r