In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re # regular expression
import nltk

In [2]:
from google.colab import files

!pip install -q kaggle

In [3]:
uploaded = files.upload()

Saving kaggle.json to kaggle.json


In [4]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle
# Change the permission
!chmod 600 ~/.kaggle/kaggle.json   

In [5]:
!kaggle datasets download -d rtatman/blog-authorship-corpus

Downloading blog-authorship-corpus.zip to /content
 94% 273M/290M [00:02<00:00, 107MB/s]
100% 290M/290M [00:02<00:00, 115MB/s]


In [6]:
from zipfile import ZipFile
file_name = '/content/blog-authorship-corpus.zip'
   
with ZipFile(file_name,'r') as zip:
  zip.extractall()
  print('Done')

Done


# 1. Load the dataset (5 points)
a. Tip: As the dataset is large, use fewer rows. Check what is working well on your
machine and decide accordingly.

In [88]:
df=pd.read_csv("blogtext.csv",nrows=10000)
df.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...


In [89]:
df.shape

(10000, 7)

In [90]:
df.isnull().sum()

id        0
gender    0
age       0
topic     0
sign      0
date      0
text      0
dtype: int64

In [91]:
df.drop(['id','date'], axis=1, inplace=True)

In [92]:
df.head()

Unnamed: 0,gender,age,topic,sign,text
0,male,15,Student,Leo,"Info has been found (+/- 100 pages,..."
1,male,15,Student,Leo,These are the team members: Drewe...
2,male,15,Student,Leo,In het kader van kernfusie op aarde...
3,male,15,Student,Leo,testing!!! testing!!!
4,male,33,InvestmentBanking,Aquarius,Thanks to Yahoo!'s Toolbar I can ...


In [93]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   gender  10000 non-null  object
 1   age     10000 non-null  int64 
 2   topic   10000 non-null  object
 3   sign    10000 non-null  object
 4   text    10000 non-null  object
dtypes: int64(1), object(4)
memory usage: 390.8+ KB


In [94]:
df['age']=df['age'].astype(object)

In [95]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   gender  10000 non-null  object
 1   age     10000 non-null  object
 2   topic   10000 non-null  object
 3   sign    10000 non-null  object
 4   text    10000 non-null  object
dtypes: object(5)
memory usage: 390.8+ KB


# 2. Preprocess rows of the “text” column (7.5 points)
+ Remove unwanted characters
+ Convert text to lowercase
+ Remove unwanted spaces
+ Remove stopwords

In [96]:
#from nltk.stem import PorterStemmer
#stemmer = PorterStemmer()
df['clean_data']=df['text'].apply(lambda x: re.sub(r'[^A-Za-z]+',' ',x))
df['clean_data']=df['clean_data'].apply(lambda x: x.lower())
df['clean_data']=df['clean_data'].apply(lambda x: x.strip())
#df['clean_data']=df['clean_data'].apply(lambda x: stemmer.stem(x))
df.head()

Unnamed: 0,gender,age,topic,sign,text,clean_data
0,male,15,Student,Leo,"Info has been found (+/- 100 pages,...",info has been found pages and mb of pdf files ...
1,male,15,Student,Leo,These are the team members: Drewe...,these are the team members drewes van der laag...
2,male,15,Student,Leo,In het kader van kernfusie op aarde...,in het kader van kernfusie op aarde maak je ei...
3,male,15,Student,Leo,testing!!! testing!!!,testing testing
4,male,33,InvestmentBanking,Aquarius,Thanks to Yahoo!'s Toolbar I can ...,thanks to yahoo s toolbar i can now capture th...


In [97]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [98]:
from nltk.corpus import stopwords
stopwords=set(stopwords.words('english'))
df['clean_data']=df['clean_data'].apply(lambda x: ' '.join([words for words in x.split() if words not in stopwords]))
df.head()

Unnamed: 0,gender,age,topic,sign,text,clean_data
0,male,15,Student,Leo,"Info has been found (+/- 100 pages,...",info found pages mb pdf files wait untill team...
1,male,15,Student,Leo,These are the team members: Drewe...,team members drewes van der laag urllink mail ...
2,male,15,Student,Leo,In het kader van kernfusie op aarde...,het kader van kernfusie op aarde maak je eigen...
3,male,15,Student,Leo,testing!!! testing!!!,testing testing
4,male,33,InvestmentBanking,Aquarius,Thanks to Yahoo!'s Toolbar I can ...,thanks yahoo toolbar capture urls popups means...


# 3. As we want to make this into a multi-label classification problem, you are required to merge all the label columns together, so that we have all the labels together for a particular sentence (7.5 points)
+ Label columns to merge: “gender”, “age”, “topic”, “sign”
+ After completing the previous step, there should be only two columns in your data
frame i.e. “text” and “labels” as shown in the below image

In [99]:
df['labels']=df.apply(lambda col: [col['gender'],str(col['age']),col['topic'],col['sign']], axis=1)
df.head()

Unnamed: 0,gender,age,topic,sign,text,clean_data,labels
0,male,15,Student,Leo,"Info has been found (+/- 100 pages,...",info found pages mb pdf files wait untill team...,"[male, 15, Student, Leo]"
1,male,15,Student,Leo,These are the team members: Drewe...,team members drewes van der laag urllink mail ...,"[male, 15, Student, Leo]"
2,male,15,Student,Leo,In het kader van kernfusie op aarde...,het kader van kernfusie op aarde maak je eigen...,"[male, 15, Student, Leo]"
3,male,15,Student,Leo,testing!!! testing!!!,testing testing,"[male, 15, Student, Leo]"
4,male,33,InvestmentBanking,Aquarius,Thanks to Yahoo!'s Toolbar I can ...,thanks yahoo toolbar capture urls popups means...,"[male, 33, InvestmentBanking, Aquarius]"


In [100]:
df.drop(['gender','age','topic','sign','text'], axis=1, inplace=True)
df.head()

Unnamed: 0,clean_data,labels
0,info found pages mb pdf files wait untill team...,"[male, 15, Student, Leo]"
1,team members drewes van der laag urllink mail ...,"[male, 15, Student, Leo]"
2,het kader van kernfusie op aarde maak je eigen...,"[male, 15, Student, Leo]"
3,testing testing,"[male, 15, Student, Leo]"
4,thanks yahoo toolbar capture urls popups means...,"[male, 33, InvestmentBanking, Aquarius]"


# 6. Create a dictionary to get the count of every label i.e. the key will be label name and value will be the total count of the label. Check below image for reference (5 points)

In [101]:
label_counts=dict()

for labels in df.labels.values:
    for label in labels:
        if label in label_counts:
            label_counts[label]+=1
        else:
            label_counts[label]=1

In [102]:
label_counts

{'13': 42,
 '14': 212,
 '15': 602,
 '16': 440,
 '17': 1185,
 '23': 253,
 '24': 655,
 '25': 386,
 '26': 234,
 '27': 1054,
 '33': 136,
 '34': 553,
 '35': 2315,
 '36': 1708,
 '37': 33,
 '38': 46,
 '39': 79,
 '40': 1,
 '41': 20,
 '42': 14,
 '43': 6,
 '44': 3,
 '45': 16,
 '46': 7,
 'Accounting': 4,
 'Aquarius': 571,
 'Aries': 4198,
 'Arts': 45,
 'Automotive': 14,
 'Banking': 16,
 'BusinessServices': 91,
 'Cancer': 504,
 'Capricorn': 215,
 'Communications-Media': 99,
 'Consulting': 21,
 'Education': 270,
 'Engineering': 127,
 'Fashion': 1622,
 'Gemini': 150,
 'HumanResources': 2,
 'Internet': 118,
 'InvestmentBanking': 70,
 'Law': 11,
 'LawEnforcement-Security': 10,
 'Leo': 301,
 'Libra': 491,
 'Marketing': 156,
 'Museums-Libraries': 17,
 'Non-Profit': 71,
 'Pisces': 454,
 'Publishing': 4,
 'Religion': 9,
 'Sagittarius': 1097,
 'Science': 63,
 'Scorpio': 971,
 'Sports-Recreation': 80,
 'Student': 1137,
 'Taurus': 812,
 'Technology': 2654,
 'Telecommunications': 2,
 'Virgo': 236,
 'female': 4

# Transform the labels - (7.5 points)
As we have noticed before, in this task each example can have multiple tags. To deal with
such kind of prediction, we need to transform labels in a binary form and the prediction will be
+ mask of 0s and 1s. For this purpose, it is convenient to use MultiLabelBinarizer from sklearn
+ Convert your train and test labels using MultiLabelBinarizer

In [103]:
from sklearn.preprocessing import MultiLabelBinarizer

In [104]:
mlb = MultiLabelBinarizer(classes=sorted(label_counts.keys()))

In [105]:
Y=mlb.fit_transform(df.labels)
Y

array([[0, 0, 1, ..., 0, 0, 1],
       [0, 0, 1, ..., 0, 0, 1],
       [0, 0, 1, ..., 0, 0, 1],
       ...,
       [0, 0, 0, ..., 1, 1, 0],
       [0, 0, 0, ..., 1, 1, 0],
       [0, 0, 0, ..., 1, 1, 0]])

In [106]:
# Just for testing purpose 
mlb.inverse_transform(Y)[0]

('15', 'Leo', 'Student', 'male')

In [107]:
from pprint import pprint 
pprint(list(mlb.classes_),compact=True)

['13', '14', '15', '16', '17', '23', '24', '25', '26', '27', '33', '34', '35',
 '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', 'Accounting',
 'Aquarius', 'Aries', 'Arts', 'Automotive', 'Banking', 'BusinessServices',
 'Cancer', 'Capricorn', 'Communications-Media', 'Consulting', 'Education',
 'Engineering', 'Fashion', 'Gemini', 'HumanResources', 'Internet',
 'InvestmentBanking', 'Law', 'LawEnforcement-Security', 'Leo', 'Libra',
 'Marketing', 'Museums-Libraries', 'Non-Profit', 'Pisces', 'Publishing',
 'Religion', 'Sagittarius', 'Science', 'Scorpio', 'Sports-Recreation',
 'Student', 'Taurus', 'Technology', 'Telecommunications', 'Virgo', 'female',
 'indUnk', 'male']


# 4. Separate features and labels, and split the data into training and testing (5 points)

In [108]:
X=df['clean_data']

In [109]:
from sklearn.model_selection import train_test_split
Xtrain,Xtest,Ytrain,Ytest=train_test_split(X,Y,test_size=0.2)
Xtrain.shape,Xtest.shape

((8000,), (2000,))

In [110]:
x_train = Xtrain.copy()
x_test = Xtest.copy()

# Vectorize the features (5 points)
+ Create a Bag of Words using count vectorizer
  * Use ngram_range=(1, 2)
  * Vectorize training and testing features
+ Print the term-document matrix

In [111]:
from sklearn.feature_extraction.text import CountVectorizer
tvect_ngram = CountVectorizer(binary=True,ngram_range=(1,2))

In [112]:
tvect_ngram.fit(Xtrain)
len(tvect_ngram.vocabulary_)

536768

In [113]:
tvect_ngram.get_feature_names()[:5]

['aa', 'aa anger', 'aa keeps', 'aa nice', 'aa sd']

In [114]:
X_train_ct = tvect_ngram.transform(Xtrain)

In [115]:
#Size of Document Term Matrix
X_train_ct.shape

(8000, 536768)

In [116]:
X_test_ct = tvect_ngram.transform(Xtest)

In [117]:
X_test_ct.shape

(2000, 536768)

In [118]:
tvect_ngram.vocabulary_['angry']

16239

In [119]:
list(x_train)[0]

'worship wal mart thursdays always awesome today especially started morning heavy heart weary worries headed women bible study slapped happy face today different since actually study gathering rest women groups worshipping fellowshiping brunch sharing god past six weeks worship time incredible really special group women passionately love god sang song new refrain kept talking bow god throne kings lay crowns instant god reminded worries worth heart lifted laid breathed sigh relief burden lifted remembered carry alone easy get caught crazy life lose sight really matters also great time fellowship two gals age three us knew traded small talk week never really talked hung morning event enjoyed getting surface little bit transplanted times could see eyes desperate someone connect beautiful thing wonderful gathering headed super wal mart grocery shopping love hate relationship wal mart expound sometime today love day breezed groceries got line oh lines realized forgotten pick things missiona

In [120]:
type(X_train_ct)

scipy.sparse.csr.csr_matrix

In [121]:
print(X_train_ct[15])

  (0, 5640)	1
  (0, 5659)	1
  (0, 7817)	1
  (0, 7876)	1
  (0, 46215)	1
  (0, 46410)	1
  (0, 87307)	1
  (0, 87355)	1
  (0, 104526)	1
  (0, 104544)	1
  (0, 113415)	1
  (0, 113422)	1
  (0, 123918)	1
  (0, 123936)	1
  (0, 139905)	1
  (0, 139908)	1
  (0, 140018)	1
  (0, 140201)	1
  (0, 152410)	1
  (0, 163849)	1
  (0, 163852)	1
  (0, 175448)	1
  (0, 175527)	1
  (0, 175608)	1
  (0, 198111)	1
  :	:
  (0, 368412)	1
  (0, 368428)	1
  (0, 368459)	1
  (0, 407062)	1
  (0, 407068)	1
  (0, 409360)	1
  (0, 409361)	1
  (0, 424787)	1
  (0, 424795)	1
  (0, 425348)	1
  (0, 425388)	1
  (0, 425394)	1
  (0, 425403)	1
  (0, 438258)	1
  (0, 438442)	1
  (0, 449837)	1
  (0, 449878)	1
  (0, 489562)	1
  (0, 489563)	1
  (0, 491556)	1
  (0, 491571)	1
  (0, 518862)	1
  (0, 518905)	1
  (0, 527605)	1
  (0, 527994)	1


# 8. Choose a classifier - (5 points)
In this task, we suggest using the One-vs-Rest approach, which is implemented in
OneVsRestClassifier class. In this approach k classifiers (= number of tags) are trained. As a
basic classifier, use LogisticRegression. It is one of the simplest methods, but often it
performs good enough in text classification tasks. It might take some time because the
number of classifiers to train is large.
+ Use a linear classifier of your choice, wrap it up in OneVsRestClassifier to train it on
every label
+ As One-vs-Rest approach might not have been discussed in the sessions, we are
providing you the code for that

In [122]:
from sklearn.linear_model import LogisticRegression
model=LogisticRegression(solver='lbfgs')

In [123]:
from sklearn.multiclass import OneVsRestClassifier
model=OneVsRestClassifier(model)

In [124]:
model.fit(X_train_ct,Ytrain)

OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

# 9. Fit the classifier, make predictions and get the accuracy (5 points)
+ a. Print the following
+ i. Accuracy score
+ ii. F1 score
+ iii. Average precision score
+ iv. Average recall score + v. Tip: Make sure you are familiar with all of them. How would you expect the things to work for the multi-label scenario? Read about micro/macro/weighted averaging

In [125]:
Ypred=model.predict(X_test_ct)

In [126]:
Ypred_inversed = mlb.inverse_transform(Ypred)
y_test_inversed = mlb.inverse_transform(Ytest)

In [127]:
Ypred_inversed

[('male',),
 ('16', 'Cancer', 'male'),
 ('Aries', 'male'),
 ('35', 'Aries', 'Technology', 'male'),
 ('female',),
 ('15', 'Libra', 'Student', 'female'),
 ('Aries', 'female'),
 ('35', 'Aries', 'Technology', 'male'),
 ('35', 'Aries', 'Technology', 'male'),
 ('15', 'Libra', 'Student', 'female'),
 ('Student', 'female'),
 ('36', 'Aries', 'Fashion', 'male'),
 ('17', 'male'),
 ('17', 'Sagittarius', 'female'),
 ('female',),
 ('Scorpio', 'female'),
 ('25', 'female'),
 ('34', 'Sagittarius', 'female', 'indUnk'),
 ('male',),
 ('Capricorn', 'Sports-Recreation', 'male'),
 ('36', 'Aries', 'Fashion', 'male'),
 ('35', 'Aries', 'male'),
 ('female', 'indUnk'),
 ('35', 'Aries', 'Technology', 'male'),
 ('35', 'Aries', 'Technology', 'male'),
 ('36', 'Aries', 'Fashion', 'male'),
 ('35', 'Aries', 'Technology', 'male'),
 ('35', 'Aries', 'Technology', 'male'),
 ('male',),
 ('35', 'Aries', 'Technology', 'male'),
 ('Aries', 'male'),
 ('34', 'Sagittarius', 'female', 'indUnk'),
 ('male',),
 ('male',),
 ('35', 'Aries

In [137]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score

def print_evaluation_scores(Ytest, Ypred):
    print('Accuracy score: ', accuracy_score(Ytest, Ypred))
    print('F1 score: ', f1_score(Ytest, Ypred, average='micro'))
    print('Average precision score: ', average_precision_score(Ytest, Ypred, average='micro'))
    print('Average recall score: ', recall_score(Ytest, Ypred, average='micro'))

In [138]:
print_evaluation_scores(Ytest, Ypred)

Accuracy score:  0.309
F1 score:  0.6341500302480338
Average precision score:  0.4504267443529862
Average recall score:  0.524125
