### Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import stopwords
import string
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings("ignore")

### Load dataset

In [None]:
# Read 'blogtext.csv' into a DataFrame. The path is relative, so the file
# must be in the current working directory.
data = pd.read_csv('blogtext.csv')

In [None]:
data.shape

## Due to system limitations, only the first 100000 records are used

In [None]:
# Keep only the first 100000 rows.
# .copy() makes the subset an independent frame: later cells add new columns
# to `data`, and writing to a bare slice triggers pandas'
# SettingWithCopyWarning (silenced here by the global warnings filter).
# NOTE(review): the copy presumably also lets the full-size frame be freed,
# since a slice may keep the parent's memory alive — confirm if memory matters.
data = data.iloc[:100000].copy()

In [None]:
data.shape

### Peek into the top 5 rows

In [None]:
data.head()

### Shape of the data

In [None]:
data.shape

In [None]:
data.describe()

### Average age is 24

### Check for Nulls

In [None]:
data.isnull().sum()

## There are no null values

In [None]:
# Plot the distribution of the 'age' column.
# NOTE(review): distplot is deprecated since seaborn 0.11 in favour of
# histplot/displot — confirm the installed seaborn version before migrating.
sns.distplot(data['age'])

In [None]:
# Number of rows per age value, with one bar per gender within each age.
sns.countplot(x="age", hue="gender", data=data)

## There are 3 separate peaks in the age distribution: 15-20, 21-27 and 33-36

In [None]:
# Number of rows per topic, bars ordered from most to least frequent.
# The column is passed explicitly as x=: seaborn >= 0.12 treats the first
# positional argument as `data`, so a positional Series is no longer read
# as the x variable. The keyword form works on older releases as well.
sns.countplot(x=data['topic'], order=data['topic'].value_counts().index)
# Rotate the category labels so they do not overlap on the x axis.
plt.xticks(rotation=90)

In [None]:
data['topic'].value_counts()

In [None]:
# Number of rows per sign, bars ordered from most to least frequent.
# The column is passed explicitly as x=: seaborn >= 0.12 treats the first
# positional argument as `data`, so a positional Series is no longer read
# as the x variable. The keyword form works on older releases as well.
sns.countplot(x=data['sign'], order=data['sign'].value_counts().index)
# Rotate the category labels so they do not overlap on the x axis.
plt.xticks(rotation=90)

In [None]:
data['sign'].value_counts()

In [None]:
# Number of rows per gender, bars ordered from most to least frequent.
# The column is passed explicitly as x=: seaborn >= 0.12 treats the first
# positional argument as `data`, so a positional Series is no longer read
# as the x variable. The keyword form works on older releases as well.
sns.countplot(x=data['gender'], order=data['gender'].value_counts().index)

In [None]:
data['gender'].value_counts()

In [None]:
data['date'].unique()

### Convert to lowercase and remove punctuation

In [None]:
def remove_spaces_lower(text):
    """Return *text* lower-cased, with leading and trailing whitespace trimmed.

    Whitespace inside the string is left untouched.
    """
    return text.strip().lower()

st_punc = string.punctuation
# Deletion table built once: maps every character of string.punctuation to
# None so that str.translate can drop them in a single pass.
_PUNC_TABLE = str.maketrans('', '', st_punc)


def remove_punctuation(text):
    """Return *text* with every character in ``string.punctuation`` removed.

    Only the ASCII punctuation listed in ``string.punctuation`` is stripped;
    other characters (letters, digits, whitespace, non-ASCII symbols) are
    kept as they are.
    """
    # One translate call replaces the per-character Python loop, which
    # matters when this is applied to every document in the corpus.
    return text.translate(_PUNC_TABLE)

# Normalise the raw text in place (trim + lower-case), then store a
# punctuation-free version of it in a separate column.
data['text'] = data['text'].map(remove_spaces_lower)
data['text_without_punc'] = data['text'].map(remove_punctuation)

In [None]:
data['text_without_punc'][3]

### Remove stopwords

In [None]:
# NLTK's English stop-word list, plus a punctuation-free variant of each
# entry. The text passed to remove_stopwords() has already had punctuation
# stripped, so a contraction such as "don't" appears there as "dont" and
# would never match the apostrophised entry in the list.
# NOTE(review): stripped variants can coincide with ordinary words (e.g. an
# entry like "we'll" becomes "well") — confirm this is acceptable for the task.
_punct_table = str.maketrans('', '', string.punctuation)
stop_words = set(stopwords.words('english'))
stop_words |= {word.translate(_punct_table) for word in stop_words}


def remove_stopwords(text):
    """Return *text* with every whitespace-separated stop word removed.

    The remaining tokens are re-joined with single spaces, so any run of
    whitespace in the input collapses to one space.
    """
    return " ".join(token for token in text.split() if token not in stop_words)

# Build the final cleaned-text column from the punctuation-free text, then
# show the first rows to check the new columns.
data["text_without_stop"] = data["text_without_punc"].apply(remove_stopwords)
data.head()

## Display first record

In [None]:
data['text'][0]

In [None]:
data['text_without_punc'][0]

In [None]:
data['text_without_stop'][0]

## Form a combined Label for classification

In [None]:
# Age is cast to text so that all four label values are strings.
data['age'] = data['age'].astype(str)
# One label list per row, in the order [gender, age, topic, sign].
data['Labels'] = [
    [gender, age, topic, sign]
    for gender, age, topic, sign in zip(
        data['gender'], data['age'], data['topic'], data['sign']
    )
]
# Keep only the model input (cleaned text) and the combined target.
data_final = data[['text_without_stop', 'Labels']]

In [None]:
data_final.head()

## Split into train and test

In [None]:
# Features are the cleaned text; targets are the per-row label lists.
X, y = data_final['text_without_stop'], data_final['Labels']
# Hold out 10% of the rows for testing; random_state is fixed at 0.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=0
)

In [None]:
# Report the feature/target shapes of each split, one value per line.
print('Training Set', X_train.shape, y_train.shape, sep='\n')
print('Testing Set', X_test.shape, y_test.shape, sep='\n')

## TF-IDF vectorizer

In [None]:
# Word-level TF-IDF features over unigrams, bigrams and trigrams, using
# scikit-learn's built-in English stop-word list.
# NOTE: despite the `_ctv` suffix, these matrices hold TF-IDF weights.
tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1,3), stop_words='english')
# The vocabulary and IDF weights are learned from the training text only;
# the test text is transformed with that same fitted vectorizer.
xtrain_ctv = tfidf.fit_transform(X_train)
xtest_ctv = tfidf.transform(X_test)
xtrain_ctv.shape, xtest_ctv.shape

## Create dictionary to get count of every label

In [None]:
from collections import Counter

# Count how often each label value occurs across all rows of data_final.
# Every label is converted to str once, so the lookup and the stored key are
# always the same value. The previous loop tested `label` but stored
# `str(label)`, which would reset the count to 1 for any non-string label.
# Converted back to a plain dict so the object type is unchanged.
label_counts = dict(
    Counter(
        str(label)
        for labels in data_final.Labels.values
        for label in labels
    )
)

label_counts

## Transform Labels

In [None]:
# One indicator column per distinct label, in sorted order. The class list
# comes from label_counts, which was built from every row of data_final,
# so labels that occur only in the test split still get a column.
binarizer=MultiLabelBinarizer(classes=sorted(label_counts.keys()))

# Replace the label lists with their binary indicator matrices.
y_train = binarizer.fit_transform(y_train)
y_test = binarizer.transform(y_test)

In [None]:
binarizer.classes_

## Model

In [None]:
# One-vs-rest wrapper: a separate logistic regression (SAG solver) is fitted
# for each label column. n_jobs=-1 asks for all available cores.
# NOTE: despite its name this is a bare classifier, not a sklearn Pipeline;
# the TF-IDF step was applied separately.
LogReg_pipeline = OneVsRestClassifier(LogisticRegression(solver='sag'),n_jobs=-1)

# Train on the TF-IDF features of the training split.
LogReg_pipeline.fit(xtrain_ctv, y_train)

# Hard 0/1 predictions for every label column of the test split.
Y_predicted_oneVsRest = LogReg_pipeline.predict(xtest_ctv)

## Evaluation metrics

In [None]:
# F1 and recall are computed from the hard 0/1 predictions.
print('F1 score: weighted', f1_score(y_test, Y_predicted_oneVsRest, average='weighted'))
# Average precision summarises a precision-recall curve, so it needs
# continuous scores rather than thresholded predictions. It is therefore
# given the per-label probabilities from predict_proba instead of the 0/1
# output of predict.
print(
    'Average precision score: weighted',
    average_precision_score(
        y_test, LogReg_pipeline.predict_proba(xtest_ctv), average='weighted'
    ),
)
print('Average recall score: weighted', recall_score(y_test, Y_predicted_oneVsRest, average='weighted'))

## Check

In [None]:
# Per-label precision, recall, F1 and support on the test split. No
# target_names are passed here; binarizer.classes_ holds the label names in
# the same column order.
print(classification_report(y_test, Y_predicted_oneVsRest))

In [None]:
# Decode both indicator matrices back to label tuples once, instead of
# re-decoding the complete matrices on every loop iteration.
actual_labels = binarizer.inverse_transform(y_test)
predicted_labels = binarizer.inverse_transform(Y_predicted_oneVsRest)

# Print true vs. predicted labels for the first ten test rows. Slicing
# (rather than indexing with range(10)) also works when the test split has
# fewer than ten rows.
for actual, predicted in zip(actual_labels[:10], predicted_labels[:10]):
    print(actual)
    print(predicted)
    print('--------------')