In [1]:
# import the required packages
import numpy as np
import pandas as pd
import string
import re
import nltk
import csv

from nltk.tokenize.treebank import TreebankWordDetokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from mlxtend.preprocessing import TransactionEncoder

from sklearn.model_selection import train_test_split 
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV

## 1. Text Data Preprocessing

### 1.1 Load the Text Data

In [2]:
dat = pd.read_csv('bbc-text.csv')

In [3]:
dat.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [4]:
dat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2225 entries, 0 to 2224
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   category  2225 non-null   object
 1   text      2225 non-null   object
dtypes: object(2)
memory usage: 34.9+ KB


In [5]:
dat['category'].value_counts()

sport            511
business         510
politics         417
tech             401
entertainment    386
Name: category, dtype: int64

In [6]:
dat.describe()

Unnamed: 0,category,text
count,2225,2225
unique,5,2126
top,sport,stars pay tribute to actor davis hollywood sta...
freq,511,2


### 1.2 Cleanse the Text Data

In [7]:
# First, remove all punctuations.
def remove_punct(text):
    text_nopunct = "".join([char for char in text if char not in string.punctuation])
    return text_nopunct

dat['text_clean'] = dat['text'].apply(lambda x: remove_punct(x))

dat.head()

Unnamed: 0,category,text,text_clean
0,tech,tv future in the hands of viewers with home th...,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...,ocean s twelve raids box office ocean s twelve...


In [8]:
# Next, tokenization
def tokenize(text):
    tokens = re.split('\W+', text)
    return tokens

dat['text_tokenized'] = dat['text_clean'].apply(lambda x: tokenize(x.lower()))

dat.head()

Unnamed: 0,category,text,text_clean,text_tokenized
0,tech,tv future in the hands of viewers with home th...,tv future in the hands of viewers with home th...,"[tv, future, in, the, hands, of, viewers, with..."
1,business,worldcom boss left books alone former worldc...,worldcom boss left books alone former worldc...,"[worldcom, boss, left, books, alone, former, w..."
2,sport,tigers wary of farrell gamble leicester say ...,tigers wary of farrell gamble leicester say ...,"[tigers, wary, of, farrell, gamble, leicester,..."
3,sport,yeading face newcastle in fa cup premiership s...,yeading face newcastle in fa cup premiership s...,"[yeading, face, newcastle, in, fa, cup, premie..."
4,entertainment,ocean s twelve raids box office ocean s twelve...,ocean s twelve raids box office ocean s twelve...,"[ocean, s, twelve, raids, box, office, ocean, ..."


In [9]:
def get_stop_words(stop_file_path):
    """load stop words """
    
    with open(stop_file_path, 'r', encoding="utf-8") as f:
        stopwords = f.readlines()
        stop_set = set(m.strip() for m in stopwords)
        return frozenset(stop_set)

#load a set of stop words
stopwords=get_stop_words("stopwords.txt")

def remove_stopwords(tokenized_list):
    text = [word for word in tokenized_list if word not in stopwords]
    return text

dat['text_nostop'] = dat['text_tokenized'].apply(lambda x: remove_stopwords(x))

dat.head()

Unnamed: 0,category,text,text_clean,text_tokenized,text_nostop
0,tech,tv future in the hands of viewers with home th...,tv future in the hands of viewers with home th...,"[tv, future, in, the, hands, of, viewers, with...","[tv, future, hands, viewers, home, theatre, sy..."
1,business,worldcom boss left books alone former worldc...,worldcom boss left books alone former worldc...,"[worldcom, boss, left, books, alone, former, w...","[worldcom, boss, left, books, former, worldcom..."
2,sport,tigers wary of farrell gamble leicester say ...,tigers wary of farrell gamble leicester say ...,"[tigers, wary, of, farrell, gamble, leicester,...","[tigers, wary, farrell, gamble, leicester, rus..."
3,sport,yeading face newcastle in fa cup premiership s...,yeading face newcastle in fa cup premiership s...,"[yeading, face, newcastle, in, fa, cup, premie...","[yeading, newcastle, fa, cup, premiership, new..."
4,entertainment,ocean s twelve raids box office ocean s twelve...,ocean s twelve raids box office ocean s twelve...,"[ocean, s, twelve, raids, box, office, ocean, ...","[ocean, twelve, raids, box, office, ocean, twe..."


In [10]:
# Lemmatize tokenized text

wn = nltk.WordNetLemmatizer()

def lemmatizing(tokenized_text):
    text = [wn.lemmatize(word) for word in tokenized_text]
    return text

dat['text_lemmatized'] = dat['text_nostop'].apply(lambda x: lemmatizing(x))

# Detokenize the data
dat['text_lemmatized'] = dat['text_lemmatized'].apply(lambda x: TreebankWordDetokenizer().detokenize(x))

dat.head(5)

Unnamed: 0,category,text,text_clean,text_tokenized,text_nostop,text_lemmatized
0,tech,tv future in the hands of viewers with home th...,tv future in the hands of viewers with home th...,"[tv, future, in, the, hands, of, viewers, with...","[tv, future, hands, viewers, home, theatre, sy...",tv future hand viewer home theatre system plas...
1,business,worldcom boss left books alone former worldc...,worldcom boss left books alone former worldc...,"[worldcom, boss, left, books, alone, former, w...","[worldcom, boss, left, books, former, worldcom...",worldcom bos left book former worldcom bos ber...
2,sport,tigers wary of farrell gamble leicester say ...,tigers wary of farrell gamble leicester say ...,"[tigers, wary, of, farrell, gamble, leicester,...","[tigers, wary, farrell, gamble, leicester, rus...",tiger wary farrell gamble leicester rushed bid...
3,sport,yeading face newcastle in fa cup premiership s...,yeading face newcastle in fa cup premiership s...,"[yeading, face, newcastle, in, fa, cup, premie...","[yeading, newcastle, fa, cup, premiership, new...",yeading newcastle fa cup premiership newcastle...
4,entertainment,ocean s twelve raids box office ocean s twelve...,ocean s twelve raids box office ocean s twelve...,"[ocean, s, twelve, raids, box, office, ocean, ...","[ocean, twelve, raids, box, office, ocean, twe...",ocean twelve raid box office ocean twelve crim...


In [11]:
dat.shape

(2225, 6)

### 1.3 Bag-of-Word

In [12]:
count_vect = CountVectorizer(max_df=0.15, min_df=0.05, max_features=3000, ngram_range=(1,2))
text_counts = count_vect.fit_transform(dat['text_lemmatized'].tolist())
text_counts.shape

(2225, 451)

In [13]:
sum_words = text_counts.sum(axis=0)
words_freq = [(word, sum_words[0, idx]) for word, idx in count_vect.vocabulary_.items()]
words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)

In [14]:
print('The highest frequency words:')
words_freq[0:5] 

The highest frequency words:


[('film', 1113),
 ('music', 835),
 ('labour', 796),
 ('party', 778),
 ('sale', 734)]

In [15]:
print('The lowest frequency words:') 
words_freq[-5:] 

The lowest frequency words:


[('take', 126),
 ('tough', 125),
 ('decided', 125),
 ('remains', 124),
 ('13', 121)]

In [16]:
feature_names=np.array(count_vect.get_feature_names())
len(count_vect.get_feature_names())

451

In [17]:
pd.DataFrame(text_counts.toarray())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,441,442,443,444,445,446,447,448,449,450
0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,2,0,0,0,0,0
4,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,2,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2220,0,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2221,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2222,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2223,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0


### 1.4 TF-IDF

In [18]:
tfidf_transformer=TfidfTransformer(use_idf=True, smooth_idf=True)
text_tfidf = tfidf_transformer.fit_transform(text_counts)

In [19]:
tfidf_transformer.idf_

array([3.63278772, 3.56046706, 3.37042345, 3.9630294 , 3.62655717,
       3.97176308, 3.8176124 , 3.24837602, 3.65171573, 3.76631911,
       2.95856855, 3.02778892, 3.81012173, 3.52617798, 3.87167962,
       3.6453665 , 3.16278409, 3.73814823, 2.89682054, 3.73814823,
       3.84042708, 3.67752361, 3.15888545, 3.95437134, 3.92883804,
       3.89577718, 3.40964417, 3.6906817 , 3.28301151, 3.6080951 ,
       3.51500468, 3.46093746, 3.85593127, 3.13201243, 3.36562728,
       3.84042708, 3.98057371, 3.86377445, 3.73814823, 3.86377445,
       3.9457876 , 3.6080951 , 3.54890623, 3.54317556, 3.54890623,
       3.90394049, 3.68408101, 3.67752361, 3.75213447, 3.2786159 ,
       3.89577718, 3.80268675, 3.22732261, 3.58996772, 3.92883804,
       3.5094645 , 3.87964779, 3.89577718, 3.93727691, 3.83276421,
       3.84814913, 3.92046979, 3.92883804, 3.1472799 , 2.99093383,
       3.95437134, 3.86377445, 2.99093383, 3.69732624, 3.4556881 ,
       3.13961703, 3.43496197, 3.98057371, 3.75213447, 3.92883

In [20]:
tfidf_transformer.idf_.shape

(451,)

In [21]:
# find maximum value for each of the features over dataset:
max_value = text_tfidf.max(axis=0).toarray().ravel()
sorted_by_tfidf = max_value.argsort()

print("Features with lowest tfidf:\n{}".format(
      feature_names[sorted_by_tfidf[:20]]))

print("Features with highest tfidf: \n{}".format(
      feature_names[sorted_by_tfidf[-20:]]))

Features with lowest tfidf:
['particularly' 'reason' 'tony' 'themselves' 'similar' 'tony blair' 'take'
 'reach' 'bbc news' 'despite' 'remain' 'simply' 'brought' 'due' 'told bbc'
 'potential' 'thursday' '30' 'instead' 'according']
Features with highest tfidf: 
['book' 'goal' 'charles' 'moment' 'net' 'report' 'election' 'project' 'pc'
 'radio' 'right' 'child' 'woman' 'card' 'mobile' '50' 'music' 'award'
 'bank' 'film']


In [22]:
pd.DataFrame(text_tfidf.toarray())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,441,442,443,444,445,446,447,448,449,450
0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.049065,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
1,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.124578,0.000000,0.144441,...,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
2,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
3,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.0,0.250514,0.000000,0.000000,0.0,0.000000,0.000000
4,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.158266,0.000000,...,0.000000,0.0,0.000000,0.0,0.344275,0.000000,0.000000,0.0,0.000000,0.167457
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2220,0.0,0.062881,0.000000,0.0,0.0,0.000000,0.067423,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
2221,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
2222,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.0,0.000000,0.153692,0.000000,0.0,0.000000,0.000000
2223,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.081735,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.086711,0.000000


## 2. Text Data Understanding

### 2.1 Extract keywords using TF-IDF matrix

In [23]:
#number of top keywords to extract
topn = 7
results =[]

In [24]:
for idx, doc in dat['text_lemmatized'].items():
    #generate tf-idf for the given document
    tf_idf_vector=text_tfidf[idx]
    
    temp = pd.DataFrame(zip(tf_idf_vector.tocoo().col, tf_idf_vector.tocoo().data),columns=['feature_number','tf_idf'])
    temp.sort_values('tf_idf', ascending = False, inplace = True)
    
    #use only topn items from vector
     
    topn_items = temp[:topn]

    tf_idf = []
    word = []

    for index, row in topn_items.iterrows():
        #print(int(row['feature_number']))
        fname = feature_names[int(row['feature_number'])]
        word.append(fname)
        tf_idf.append(round(row['tf_idf'], 3))

    result = dict(zip(word, tf_idf))
    
    results.append(result)

In [25]:
len(results)

2225

In [26]:
#add new columns

dat['keywords + tfidf'] = results
dat.head()

Unnamed: 0,category,text,text_clean,text_tokenized,text_nostop,text_lemmatized,keywords + tfidf
0,tech,tv future in the hands of viewers with home th...,tv future in the hands of viewers with home th...,"[tv, future, in, the, hands, of, viewers, with...","[tv, future, hands, viewers, home, theatre, sy...",tv future hand viewer home theatre system plas...,"{'tv': 0.715, 'network': 0.271, 'technology': ..."
1,business,worldcom boss left books alone former worldc...,worldcom boss left books alone former worldc...,"[worldcom, boss, left, books, alone, former, w...","[worldcom, boss, left, books, former, worldcom...",worldcom bos left book former worldcom bos ber...,"{'bos': 0.443, 'book': 0.302, 'admitted': 0.30..."
2,sport,tigers wary of farrell gamble leicester say ...,tigers wary of farrell gamble leicester say ...,"[tigers, wary, of, farrell, gamble, leicester,...","[tigers, wary, farrell, gamble, leicester, rus...",tiger wary farrell gamble leicester rushed bid...,"{'union': 0.474, 'involved': 0.329, 'club': 0...."
3,sport,yeading face newcastle in fa cup premiership s...,yeading face newcastle in fa cup premiership s...,"[yeading, face, newcastle, in, fa, cup, premie...","[yeading, newcastle, fa, cup, premiership, new...",yeading newcastle fa cup premiership newcastle...,"{'cup': 0.466, 'united': 0.454, 'meet': 0.358,..."
4,entertainment,ocean s twelve raids box office ocean s twelve...,ocean s twelve raids box office ocean s twelve...,"[ocean, s, twelve, raids, box, office, ocean, ...","[ocean, twelve, raids, box, office, ocean, twe...",ocean twelve raid box office ocean twelve crim...,"{'office': 0.42, 'robert': 0.345, 'weekend': 0..."


### 2.2 Association Rules Mining on keywords

In [27]:
dat['keywords'] = dat['keywords + tfidf'].apply(lambda x: list(x.keys()))

with open("ARMKeywords.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerows(dat['keywords'])

ARMdata = pd.read_csv('ARMKeywords.csv', header = None)

full_list=pd.Series([])
for col in ARMdata:
    full_list = full_list.append(ARMdata[col].dropna())

y = full_list.value_counts().head(50).to_frame()

trans = []
for i in range(0, 2225):
    trans.append([str(ARMdata.values[i,j]) for j in range(0, 7)])

# conveting it into an numpy array
trans = np.array(trans)

# transform into one-hot encoded NumPy boolean array
te = TransactionEncoder()
data_encoded = te.fit_transform(trans)
data_encoded = pd.DataFrame(data_encoded, columns = te.columns_)

data_encoded = data_encoded.loc[:, y.index]

# Generate the Frequent Itemsets using Apriori Algorithm and set Support threshold as 0.01
frequent_itemsets=apriori(data_encoded, min_support = 0.01, use_colnames = True)
print("\n>>>Generate the Frequent Itemsets using Apriori Algorithm and set Support threshold as 0.01")
print(frequent_itemsets.head())
rules_c = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.3)
rules_c.sort_values('confidence', ascending = False, inplace = True)
print("\n>>> Generate association rules using Confidence threshold")
print(rules_c.head(5))
rules_l = association_rules(frequent_itemsets, metric="lift", min_threshold=11)
rules_l.sort_values('lift', ascending = False, inplace = True)
print("\n>>> Generate association rules using Lift threshold")
print(rules_l.head(5))
print("\n>>> Generate association rules using both Lift and Confidence threshold")
print(rules_c[ (rules_c['lift'] >= 10) &
       (rules_c['confidence'] >= 0.4).head(5) ])

  full_list=pd.Series([])



>>>Generate the Frequent Itemsets using Apriori Algorithm and set Support threshold as 0.01
    support  itemsets
0  0.076404    (film)
1  0.048539  (labour)
2  0.046742   (music)
3  0.046742   (award)
4  0.046292    (sale)

>>> Generate association rules using Confidence threshold
       antecedents consequents  antecedent support  consequent support  \
38  (actor, award)      (film)            0.016180            0.076404   
27         (phone)    (mobile)            0.034607            0.036854   
3          (actor)      (film)            0.035506            0.076404   
26        (mobile)     (phone)            0.036854            0.034607   
36   (film, award)     (actor)            0.022921            0.035506   

     support  confidence       lift  leverage  conviction  
38  0.012584    0.777778  10.179739  0.011348    4.156180  
27  0.023820    0.688312  18.676750  0.022545    3.090094  
3   0.024270    0.683544   8.946389  0.021557    2.918562  
26  0.023820    0.646341  18.67

  print(rules_c[ (rules_c['lift'] >= 10) &


In [28]:
techdf = pd.DataFrame(dat.loc[dat['category'] == "tech"])

with open("ARMTechKeywords.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerows(techdf['keywords'])

ARMTechdata = pd.read_csv('ARMTechKeywords.csv', header = None)

tech_list=pd.Series([])
for col in ARMTechdata:
    tech_list = tech_list.append(ARMTechdata[col].dropna())
    
y = tech_list.value_counts().head(50).to_frame()

trans = []
for i in range(0, 401):
    trans.append([str(ARMTechdata.values[i,j]) for j in range(0, 7)])

# conveting it into an numpy array
trans = np.array(trans)

te = TransactionEncoder()
data_encoded = te.fit_transform(trans)
data_encoded = pd.DataFrame(data_encoded, columns = te.columns_)
data_encoded = data_encoded.loc[:, y.index]

frequent_itemsets=apriori(data_encoded, min_support = 0.01, use_colnames = True)
rules_c = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.3)
rules_c.sort_values('confidence', ascending = False, inplace = True)
print("\n>>> Generate association rules using Confidence threshold")
print(rules_c.head(5))
rules_l = association_rules(frequent_itemsets, metric="lift", min_threshold=11)
rules_l.sort_values('lift', ascending = False, inplace = True)
print("\n>>> Generate association rules using Lift threshold")
print(rules_l.head(5))
print("\n>>> Generate association rules using both Lift and Confidence threshold")
print(rules_c[ (rules_c['lift'] >= 20) &
       (rules_c['confidence'] >= 0.5) ].head(5))

  tech_list=pd.Series([])



>>> Generate association rules using Confidence threshold
                                 antecedents consequents  antecedent support  \
120                          (mobile, using)     (phone)            0.012469   
153                          (user, website)      (site)            0.014963   
118                        (mobile, message)     (phone)            0.014963   
277  (technology, consumer, device, digital)     (video)            0.014963   
274   (technology, video, consumer, digital)    (device)            0.014963   

     consequent support   support  confidence       lift  leverage  conviction  
120            0.157107  0.012469         1.0   6.365079  0.010510         inf  
153            0.129676  0.014963         1.0   7.711538  0.013022         inf  
118            0.157107  0.014963         1.0   6.365079  0.012612         inf  
277            0.092269  0.014963         1.0  10.837838  0.013582         inf  
274            0.124688  0.014963         1.0   8.02000

In [29]:
busidf = pd.DataFrame(dat.loc[dat['category'] == "business"])

with open("ARMBusiKeywords.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerows(busidf['keywords'])

ARMBusidata = pd.read_csv('ARMBusiKeywords.csv', header = None)

busi_list=pd.Series([])
for col in ARMBusidata:
    busi_list = busi_list.append(ARMBusidata[col].dropna())

y = busi_list.value_counts().head(50).to_frame()

trans = []
for i in range(0, 510):
    trans.append([str(ARMBusidata.values[i,j]) for j in range(0, 7)])

# conveting it into an numpy array
trans = np.array(trans)
te = TransactionEncoder()
data_encoded = te.fit_transform(trans)
data_encoded = pd.DataFrame(data_encoded, columns = te.columns_)

data_encoded = data_encoded.loc[:, y.index]

frequent_itemsets=apriori(data_encoded, min_support = 0.005, use_colnames = True)
rules_c = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.8)
rules_c.sort_values('confidence', ascending = False, inplace = True)
print("\n>>> Generate association rules using Confidence threshold")
print(rules_c.head(5))
rules_l = association_rules(frequent_itemsets, metric="lift", min_threshold=15)
rules_l.sort_values('lift', ascending = False, inplace = True)
print("\n>>> Generate association rules using Lift threshold")
print(rules_l.head(5))

print("\n>>> Generate association rules using both Lift and Confidence threshold")
print(rules_c[ (rules_c['lift'] >= 20) &
       (rules_c['confidence'] >= 0.4) ].head(5))

  busi_list=pd.Series([])



>>> Generate association rules using Confidence threshold
                    antecedents consequents  antecedent support  \
0              (share, quarter)    (profit)            0.005882   
44  (european, growth, economy)  (economic)            0.005882   
23                  (job, rate)    (growth)            0.009804   
47   (analyst, growth, economy)      (rate)            0.005882   
25          (spending, quarter)    (growth)            0.007843   

    consequent support   support  confidence       lift  leverage  conviction  
0             0.129412  0.005882         1.0   7.727273  0.005121         inf  
44            0.105882  0.005882         1.0   9.444444  0.005260         inf  
23            0.125490  0.009804         1.0   7.968750  0.008574         inf  
47            0.098039  0.005882         1.0  10.200000  0.005306         inf  
25            0.125490  0.007843         1.0   7.968750  0.006859         inf  

>>> Generate association rules using Lift threshold
     

In [30]:
sportdf = pd.DataFrame(dat.loc[dat['category'] == "sport"])

with open("ARMSportKeywords.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerows(sportdf['keywords'])

ARMSportdata = pd.read_csv('ARMSportKeywords.csv', header = None)

sport_list=pd.Series([])
for col in ARMSportdata:
    sport_list = sport_list.append(ARMSportdata[col].dropna())

y = sport_list.value_counts().head(50).to_frame()

trans = []
for i in range(0, 511):
    trans.append([str(ARMSportdata.values[i,j]) for j in range(0, 7)])

# conveting it into an numpy array
trans = np.array(trans)
te = TransactionEncoder()
data_encoded = te.fit_transform(trans)
data_encoded = pd.DataFrame(data_encoded, columns = te.columns_)

data_encoded = data_encoded.loc[:, y.index]

frequent_itemsets=apriori(data_encoded, min_support = 0.01, use_colnames = True)
rules_c = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.8)
rules_c.sort_values('confidence', ascending = False, inplace = True)
print("\n>>> Generate association rules using Confidence threshold")
print(rules_c.head(5))
rules_l = association_rules(frequent_itemsets, metric="lift", min_threshold=10)
rules_l.sort_values('lift', ascending = False, inplace = True)
print("\n>>> Generate association rules using Lift threshold")
print(rules_l.head(5))

print("\n>>> Generate association rules using both Lift and Confidence threshold")
print(rules_c[ (rules_c['lift'] >= 11) &
       (rules_c['confidence'] >= 0.5) ].head(5))

  sport_list=pd.Series([])



>>> Generate association rules using Confidence threshold
               antecedents consequents  antecedent support  \
4              (six, wale)    (nation)            0.013699   
2           (six, ireland)    (nation)            0.017613   
5  (wale, france, ireland)   (england)            0.015656   
1        (nation, england)       (six)            0.023483   
0        (france, ireland)   (england)            0.029354   

   consequent support   support  confidence       lift  leverage  conviction  
4            0.052838  0.013699    1.000000  18.925926  0.012975         inf  
2            0.052838  0.015656    0.888889  16.823045  0.014725    8.524462  
5            0.152642  0.013699    0.875000   5.732372  0.011309    6.778865  
1            0.060665  0.019569    0.833333  13.736559  0.018145    5.636008  
0            0.152642  0.023483    0.800000   5.241026  0.019003    4.236791  

>>> Generate association rules using Lift threshold
          antecedents     consequents  an

In [31]:
entertainmentdf = pd.DataFrame(dat.loc[dat['category'] == "entertainment"])

with open("ARMEntertainKeywords.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerows(entertainmentdf['keywords'])

ARMEntertainmentdata = pd.read_csv('ARMEntertainKeywords.csv', header = None)

entertainment_list=pd.Series([])
for col in ARMEntertainmentdata:
    entertainment_list = entertainment_list.append(ARMEntertainmentdata[col].dropna())

y = entertainment_list.value_counts().head(50).to_frame()

trans = []
for i in range(0, 386):
    trans.append([str(ARMEntertainmentdata.values[i,j]) for j in range(0, 7)])

# conveting it into an numpy array
trans = np.array(trans)
te = TransactionEncoder()
data_encoded = te.fit_transform(trans)
data_encoded = pd.DataFrame(data_encoded, columns = te.columns_)

data_encoded = data_encoded.loc[:, y.index]

frequent_itemsets=apriori(data_encoded, min_support = 0.01, use_colnames = True)
rules_c = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.8)
rules_c.sort_values('confidence', ascending = False, inplace = True)
print("\n>>> Generate association rules using Confidence threshold")
print(rules_c.head(5))
rules_l = association_rules(frequent_itemsets, metric="lift", min_threshold=19)
rules_l.sort_values('lift', ascending = False, inplace = True)
print("\n>>> Generate association rules using Lift threshold")
print(rules_l.head(5))

print("\n>>> Generate association rules using both Lift and Confidence threshold")
print(rules_c[ (rules_c['lift'] >= 15) &
       (rules_c['confidence'] >= 0.35) ].head(5))

  entertainment_list=pd.Series([])



>>> Generate association rules using Confidence threshold
                  antecedents consequents  antecedent support  \
139    (role, british, actor)      (film)            0.010363   
116  (film, included, winner)     (award)            0.012953   
188  (actor, dollar, charles)  (director)            0.010363   
83          (weekend, taking)    (office)            0.015544   
163   (film, release, taking)    (office)            0.012953   

     consequent support   support  confidence       lift  leverage  conviction  
139            0.393782  0.010363         1.0   2.539474  0.006282         inf  
116            0.238342  0.012953         1.0   4.195652  0.009866         inf  
188            0.111399  0.010363         1.0   8.976744  0.009208         inf  
83             0.062176  0.015544         1.0  16.083333  0.014578         inf  
163            0.062176  0.012953         1.0  16.083333  0.012148         inf  

>>> Generate association rules using Lift threshold
           

In [62]:
politicsdf = pd.DataFrame(dat.loc[dat['category'] == "politics"])

with open("ARMPoliticsKeywords.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerows(politicsdf['keywords'])

ARMPoliticsdata = pd.read_csv('ARMPoliticsKeywords.csv', header = None)

politics_list=pd.Series([])
for col in ARMPoliticsdata:
    politics_list = politics_list.append(ARMPoliticsdata[col].dropna())

y = politics_list.value_counts().head(50).to_frame()

trans = []
for i in range(0, 417):
    trans.append([str(ARMPoliticsdata.values[i,j]) for j in range(0, 7)])

# conveting it into an numpy array
trans = np.array(trans)
te = TransactionEncoder()
data_encoded = te.fit_transform(trans)
data_encoded = pd.DataFrame(data_encoded, columns = te.columns_)

data_encoded = data_encoded.loc[:, y.index]

frequent_itemsets=apriori(data_encoded, min_support = 0.01, use_colnames = True)
rules_c = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.8)
print("\n>>> Generate association rules using Confidence threshold")
rules_c.sort_values('confidence', ascending = False, inplace = True)
print(rules_c.head(5))
rules_l = association_rules(frequent_itemsets, metric="lift", min_threshold=22)
rules_l.sort_values('lift', ascending = False, inplace = True)
print("\n>>> Generate association rules using Lift threshold")
print(rules_l.head(5))

print("\n>>> Generate association rules using both Lift and Confidence threshold")
print(rules_c[ (rules_c['lift'] >= 15) &
       (rules_c['confidence'] >= 0.5) ].head(5))

  politics_list=pd.Series([])



>>> Generate association rules using Confidence threshold
                             antecedents consequents  antecedent support  \
107                (cut, spending, tory)       (tax)            0.014388   
91     (prime minister, brown, election)     (prime)            0.011990   
26                   (blair, chancellor)     (brown)            0.035971   
57              (prime minister, leader)     (prime)            0.011990   
93   (prime minister, blair, chancellor)     (brown)            0.019185   

     consequent support   support  confidence       lift  leverage  conviction  
107            0.107914  0.014388         1.0   9.266667  0.012836         inf  
91             0.091127  0.011990         1.0  10.973684  0.010898         inf  
26             0.134293  0.035971         1.0   7.446429  0.031141         inf  
57             0.091127  0.011990         1.0  10.973684  0.010898         inf  
93             0.134293  0.019185         1.0   7.446429  0.016608         inf 

## 3. Classification Modeling

### 3.1 Sample the data

In [33]:
X = text_counts.toarray()
y = dat['category'].map( {'tech': 0, 'business': 1, 'sport': 2, 'entertainment': 3, 'politics': 4} ).astype(int)

In [34]:
# Split both Inputs (X) and Ouput (y) into training set (70%) and testing set (30%) 
# Set the random state as 2
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2)

In [35]:
X2 = text_tfidf.toarray()
y = dat['category'].map( {'tech': 0, 'business': 1, 'sport': 2, 'entertainment': 3, 'politics': 4} ).astype(int)

In [36]:
# Split both Inputs (X) and Ouput (y) into training set (70%) and testing set (30%)
# Set the random state as 2
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y, test_size=0.3, random_state=2)

### 3.2 Build the Model(s)

In [37]:
scores = cross_val_score(LogisticRegression(solver="lbfgs", max_iter=1000), X_train, y_train, cv=15)
print("Mean cross-validation accuracy: {:.3f}".format(np.mean(scores)))

param_grid = {'C': [0.01, 0.1, 1, 10]}
grid = GridSearchCV(LogisticRegression(solver="lbfgs", max_iter=1000), param_grid, cv=15)
grid.fit(X_train, y_train)
print("Best cross-validation score: {:.3f}".format(grid.best_score_))
print("Best parameters: ", grid.best_params_)

Mean cross-validation accuracy: 0.940
Best cross-validation score: 0.941
Best parameters:  {'C': 0.1}


In [38]:
print("Test score: {:.3f}".format(grid.score(X_test, y_test)))

Test score: 0.946


In [39]:
scores = cross_val_score(LogisticRegression(solver='lbfgs', max_iter=1000), X2_train, y2_train, cv=5)
print("Mean cross-validation accuracy: {:.3f}".format(np.mean(scores)))

param_grid = {'C': [0.01, 0.1, 1, 10]}
grid = GridSearchCV(LogisticRegression(solver="lbfgs", max_iter=1000), param_grid, cv=5)
grid.fit(X2_train, y2_train)
print("Best cross-validation score: {:.3f}".format(grid.best_score_))
print("Best parameters: ", grid.best_params_)

Mean cross-validation accuracy: 0.944
Best cross-validation score: 0.944
Best parameters:  {'C': 10}


In [40]:
print("Test score: {:.3f}".format(grid.score(X2_test, y2_test)))

Test score: 0.948


In [41]:
rf = RandomForestClassifier(max_samples=1000)
scores = cross_val_score(rf, X_train, y_train, cv=5)
print("Mean cross-validation accuracy: {:.3f}".format(np.mean(scores)))
param = {'n_estimators': [10, 20, 30],
        'max_depth': [30, None]}
grid = GridSearchCV(rf, param, cv=5)
grid.fit(X_train, y_train)
print("Best cross-validation score: {:.3f}".format(grid.best_score_))
print("Best parameters: ", grid.best_params_)

Mean cross-validation accuracy: 0.938
Best cross-validation score: 0.933
Best parameters:  {'max_depth': 30, 'n_estimators': 30}


In [42]:
print("Test score: {:.3f}".format(grid.score(X_test, y_test)))

Test score: 0.934


In [43]:
rf = RandomForestClassifier(max_samples=900)
scores = cross_val_score(rf, X2_train, y2_train, cv=5)
print("Mean cross-validation accuracy: {:.3f}".format(np.mean(scores)))
param = {'n_estimators': [10, 20, 30],
        'max_depth': [None, 30]}
grid = GridSearchCV(rf, param, cv=5, n_jobs=-1)
grid.fit(X2_train, y2_train)
print("Best cross-validation score: {:.3f}".format(grid.best_score_))
print("Best parameters: ", grid.best_params_)

Mean cross-validation accuracy: 0.934
Best cross-validation score: 0.930
Best parameters:  {'max_depth': None, 'n_estimators': 30}


In [44]:
print("Test score: {:.3f}".format(grid.score(X2_test, y2_test)))

Test score: 0.924


In [45]:
clf = MultinomialNB()
scores = cross_val_score(clf, X_train, y_train, cv=5)
print("Mean cross-validation accuracy: {:.3f}".format(np.mean(scores)))
param = {'alpha': [1, 2, 5, 10]}
grid = GridSearchCV(clf, param, cv=5)
grid.fit(X_train, y_train)
print("Best cross-validation score: {:.3f}".format(grid.best_score_))
print("Best parameters: ", grid.best_params_)

Mean cross-validation accuracy: 0.945
Best cross-validation score: 0.945
Best parameters:  {'alpha': 10}


In [46]:
print("Test score: {:.3f}".format(grid.score(X_test, y_test)))

Test score: 0.942


In [47]:
clf = MultinomialNB()
scores = cross_val_score(clf, X2_train, y2_train, cv=5)
print("Mean cross-validation accuracy: {:.3f}".format(np.mean(scores)))
param = {'alpha': [1, 2, 5, 10]}
grid = GridSearchCV(clf, param, cv=5)
grid.fit(X2_train, y2_train)
print("Best cross-validation score: {:.3f}".format(grid.best_score_))
print("Best parameters: ", grid.best_params_)

Mean cross-validation accuracy: 0.944
Best cross-validation score: 0.944
Best parameters:  {'alpha': 1}


In [48]:
print("Test score: {:.3f}".format(grid.score(X2_test, y2_test)))

Test score: 0.943


### 3.3 Evaluate and Improve the Model(s)

In [49]:
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y, test_size=0.35, random_state=2)

scores = cross_val_score(LogisticRegression(solver='lbfgs', max_iter=1000), X2_train, y2_train, cv=20)
print("Mean cross-validation accuracy: {:.3f}".format(np.mean(scores)))
param_grid = {'C': [1, 10, 30, 50]}
grid = HalvingGridSearchCV(LogisticRegression(solver="lbfgs", max_iter=1000), param_grid, random_state=2, cv=20).fit(X2_train, y2_train)
print("Best cross-validation score: {:.3f}".format(grid.best_score_))
print("Best parameters: ", grid.best_params_)

print("Test score: {:.3f}".format(grid.score(X2_test, y2_test)))

Mean cross-validation accuracy: 0.946
Best cross-validation score: 0.947
Best parameters:  {'C': 10}
Test score: 0.953


In [50]:
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y, test_size=0.35, random_state=2)

rf = RandomForestClassifier(max_samples=1000)
scores = cross_val_score(rf, X2_train, y2_train, cv=25)
print("Mean cross-validation accuracy: {:.3f}".format(np.mean(scores)))
param = {'n_estimators': [100, 300, 700],
        'max_depth': [5, 30, 70, None]}
grid = GridSearchCV(rf, param, cv=25, n_jobs=-1)
grid.fit(X2_train, y2_train)
print("Best cross-validation score: {:.3f}".format(grid.best_score_))
print("Best parameters: ", grid.best_params_)

print("Test score: {:.3f}".format(grid.score(X2_test, y2_test)))

Mean cross-validation accuracy: 0.934
Best cross-validation score: 0.940
Best parameters:  {'max_depth': None, 'n_estimators': 300}
Test score: 0.947


In [51]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, random_state=4)

clf = MultinomialNB()
scores = cross_val_score(clf, X_train, y_train, cv=20)
print("Mean cross-validation accuracy: {:.3f}".format(np.mean(scores)))
param = {'alpha': [0.1, 1, 2, 5, 10]}
grid = GridSearchCV(clf, param, cv=20)
grid.fit(X_train, y_train)
print("Best cross-validation score: {:.3f}".format(grid.best_score_))
print("Best parameters: ", grid.best_params_)

print("Test score: {:.3f}".format(grid.score(X_test, y_test)))

Mean cross-validation accuracy: 0.947
Best cross-validation score: 0.948
Best parameters:  {'alpha': 0.1}
Test score: 0.953


### 3.4 Final improvement

In [61]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=2)

clf = MultinomialNB()
scores = cross_val_score(clf, X_train, y_train, cv=100)
print("Mean cross-validation accuracy: {:.3f}".format(np.mean(scores)))
param = {'alpha': [0.1, 1, 10, 50]}
grid = HalvingGridSearchCV(clf, param, cv=100).fit(X_train, y_train)
print("Best cross-validation score: {:.3f}".format(grid.best_score_))
print("Best parameters: ", grid.best_params_)

print("Test score: {:.3f}".format(grid.score(X_test, y_test)))

Mean cross-validation accuracy: 0.945
Best cross-validation score: 0.947
Best parameters:  {'alpha': 0.1}
Test score: 0.942
