In [None]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import matplotlib.pyplot as plt # data visualization library
%matplotlib inline
import seaborn as sns

import re
import nltk

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import accuracy_score, f1_score, average_precision_score, recall_score


from nltk.tokenize import word_tokenize
from string import punctuation
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer #word stemmer class
lemma = WordNetLemmatizer()
from wordcloud import WordCloud, STOPWORDS
from nltk import FreqDist

# 1. Load the dataset 

In [None]:
# Read only the first 5000 rows of the CSV (nrows=5000).
df = pd.read_csv('/kaggle/input/vehicle/vehicle.csv', nrows= 5000)
# Show 5 randomly sampled rows as a sanity check; the sample changes on every run.
df.sample(5)

# 2. Preprocess rows of the “text” column 
#### a. Remove unwanted characters
#### b. Convert text to lowercase
#### c. Remove unwanted spaces
#### d. Remove stopwords

In [None]:
# Vocabulary from NLTK's `words` corpus, stored as a set for fast membership tests.
# NOTE(review): presumably the corpus must already be available locally
# (e.g. via nltk.download('words')) — confirm for the target environment.
words = set(nltk.corpus.words.words())

In [None]:
# Build the English stop-word set once. The original rebuilt it from the corpus for
# every single token, which dominated the runtime of df.text.apply(normalizer).
_STOP_WORDS = frozenset(stopwords.words('english'))


def normalizer(blogs):
    """Clean one raw text entry and return it as a normalized string.

    Steps, in order:
      1. drop whitespace-separated tokens that start with '@';
      2. replace every character that is not an ASCII letter with a space;
      3. lowercase the text;
      4. split on whitespace (this also collapses repeated spaces);
      5. remove English stop words;
      6. lemmatize each remaining word with the module-level ``lemma``.

    Parameters
    ----------
    blogs : str
        Raw text of a single entry.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces (may be empty).
    """
    # str.split() never yields empty tokens, so indexing token[0] is safe.
    blogs = " ".join(token for token in blogs.split() if token[0] != '@')
    blogs = re.sub('[^a-zA-Z]', ' ', blogs)
    blogs = blogs.lower()
    # Only letters and spaces remain, so split() alone removes the extra spaces.
    kept = [word for word in blogs.split() if word not in _STOP_WORDS]
    return " ".join(lemma.lemmatize(word) for word in kept)

In [None]:
# Run the cleaning pipeline on every entry of `text` and store the result in a new column.
df['normalized_text'] = df.text.apply(normalizer)

In [None]:
# Preview the first rows, now including `normalized_text`.
df.head()

In [None]:
# Remove non-English words from the normalized text
def remove_non_english_words(blog):
    """Drop alphabetic tokens that are missing from the global vocabulary set ``words``.

    The text is tokenized with ``nltk.wordpunct_tokenize``. A token is kept when its
    lowercase form is in ``words`` or when it is not purely alphabetic. The kept
    tokens are joined back together with single spaces.
    """
    kept = []
    for token in nltk.wordpunct_tokenize(blog):
        if token.lower() in words or not token.isalpha():
            kept.append(token)
    return " ".join(kept)

# Overwrite `normalized_text` with the vocabulary-filtered version of each entry.
df['normalized_text'] = df.normalized_text.apply(remove_non_english_words)

In [None]:
# Preview the first rows after filtering out non-vocabulary words.
df.head()

####  Word Cloud of all the normalized text

In [None]:
# Concatenate every normalized text entry into one space-separated string.
all_words = " ".join(df.normalized_text)

In [None]:
# Build a 2000x2000 word cloud on a white background from the combined text,
# excluding the wordcloud package's STOPWORDS, then display it without axes.
wordcloud = WordCloud(
    width=2000,
    height=2000,
    background_color='white',
    stopwords=STOPWORDS,
).generate(all_words)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

# 3. As we want to make this into a multi-label classification problem, you are required to merge all the label columns together, so that we have all the labels together for a particular sentence (7.5 points)

#### a. Label columns to merge: “gender”, “age”, “topic”, “sign”

In [None]:
## Create another dataframe dfT holding only the columns needed to build the label.
## .copy() makes dfT independent of df, so modifying dfT afterwards is not a
## chained assignment on a slice of df (SettingWithCopyWarning).
dfT = df[['gender', 'age', 'topic', 'sign']].copy()

In [None]:
## Convert age to string so it can be used as a text label.
## NOTE(review): presumably the other three columns are already strings — verify.
dfT['age']=dfT['age'].astype('str')

In [None]:
## Build the 2D label matrix 'm': one inner list [gender, age, topic, sign] per row of dfT.
## .values.tolist() produces the same list of lists in one vectorized call. The original
## nested loop used dfT.iloc[i][j], which creates a Series per cell and relies on
## positional integer indexing of a Series (deprecated in recent pandas).
m = dfT.values.tolist()

In [None]:
# Add a `labels` column: each cell holds that row's list from m.
df['labels']=m

In [None]:
# Preview the first rows, now including `labels`.
df.head()

#### b. After completing the previous step, there should be only two columns in your data frame i.e. “text” and “labels” as shown in the below image

In [None]:
# Keep only the cleaned text and the merged label list.
final_df = df[['normalized_text', 'labels']]

In [None]:
# Preview the two-column frame.
final_df.head()

In [None]:
# Let's check the distribution of label combinations.
# Lists are unhashable, so each label list is cast to its string form before counting.
final_df['labels'].astype('str').value_counts()

In [None]:
## Count missing values in each column.
final_df.isna().sum()

In [None]:
# No Null Values

# 4. Separate features and labels, and split the data into training and testing 

In [None]:
# Features: the cleaned text. Targets: the per-row label lists.
X = final_df['normalized_text']
y = final_df['labels']

In [None]:
# Hold out 25% of the rows for testing. The fixed random_state makes the split, and
# every score computed from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test =  train_test_split(X, y, test_size = 0.25, random_state = 42)

# 5. Vectorize the features 
#### a. Create a Bag of Words using count vectorizer
#### i. Use ngram_range=(1, 2)
##### ii. Vectorize training and testing features


In [None]:
# Keep only terms (unigrams and bigrams) that occur in at least 15% and at most 80% of the
# documents the vectorizer is fitted on, and cap the vocabulary at the 100 most frequent terms.

vectorizer = CountVectorizer(ngram_range = (1,2), stop_words=stopwords.words('english'), 
                             min_df = 0.15, max_df = 0.8, max_features = 100)

In [None]:
# Learn the vocabulary from the training split and encode it as a document-term matrix;
# the test split is encoded with that same fitted vocabulary (transform only, no refit).

X_train_dtm = vectorizer.fit_transform(X_train)
X_test_dtm = vectorizer.transform(X_test)
X_train_dtm

In [None]:
# Check the vocabulary (first 10 features).
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it.
vectorizer.get_feature_names_out()[:10]

##### b. Print the term-document matrix

##### Train Document Term Matrix

In [None]:

# Print the training document-term matrix.
print(X_train_dtm )

In [None]:
# Examine the vocabulary and the training document-term matrix together.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it.
pd.DataFrame(X_train_dtm.toarray(), columns = vectorizer.get_feature_names_out())


##### Test Document Term Matrix

In [None]:

# Print the TEST document-term matrix (the original printed X_train_dtm again here).
print(X_test_dtm)

In [None]:
# Examine the vocabulary and the test document-term matrix together.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it.
pd.DataFrame(X_test_dtm.toarray(), columns = vectorizer.get_feature_names_out())

# 6. Create a dictionary to get the count of every label, i.e. the key will be the label name and the value will be the total count of that label. Check the image below for reference (5 points)

In [None]:
# Independent copy of the four label columns, so that modifying dfT afterwards is not a
# chained assignment on a slice of df (SettingWithCopyWarning).
dfT = df[['gender', 'age', 'topic', 'sign']].copy()

In [None]:
# Cast age to string so its values can be used as text label names.
dfT['age'] = dfT['age'].astype('str')

In [None]:
# Collect every distinct value of each label column (keys) with its frequency (values).
# value_counts() is computed once per column; the original recomputed it three times
# for every distinct value. Counts are stored as plain Python ints.
keys = []
values = []

for _, column in dfT.items():  # iterate through all the columns, in order
    counts = column.value_counts()
    keys.extend(counts.index.tolist())
    values.extend(counts.tolist())

In [None]:
# Map each label value to its count. If the same value appeared in more than one
# column, the count from the later column would overwrite the earlier one.
dictionary = dict(zip(keys,values))

In [None]:
# Show the label -> count mapping.
print(dictionary)

# 7. Transform the labels - (7.5 points)
As we have noticed before, in this task each example can have multiple tags. To deal with
this kind of prediction, we need to transform the labels into a binary form, and the prediction will be
a mask of 0s and 1s. For this purpose, it is convenient to use MultiLabelBinarizer from sklearn.

#### a. Convert your train and test labels using MultiLabelBinarizer

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer 
# Pass the class list explicitly (the sorted keys of `dictionary`) so the order of the
# binary columns is fixed by that list.
mlb = MultiLabelBinarizer(classes=sorted(dictionary.keys()))
# Encode the training label lists, then encode the test label lists with the same binarizer.
y_train_mlb = mlb.fit_transform(y_train)
y_test_mlb = mlb.transform(y_test)

In [None]:
# Binarized labels of the first training row.
y_train_mlb[0]

In [None]:
# Binarized labels of the first test row.
y_test_mlb[0]

##### Let's verify a single row of the train set after the MLB conversion

In [None]:
# Original label list of the second training row (position 1).
y_train.iloc[1]

In [None]:
# Decode the binarized training labels back to label names and show the same row (position 1).
mlb.inverse_transform(y_train_mlb)[1]

##### Result as expected

# 8. Choose a classifier 


In [None]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

# OneVsRestClassifier fits one independent binary logistic regression per label column.
# Each sub-problem is binary, so multi_class='ovr' on the base estimator was redundant;
# that parameter is also deprecated since scikit-learn 1.5, so it is omitted.
lr = LogisticRegression(solver='lbfgs')
ovr = OneVsRestClassifier(lr)

# Fit on the training document-term matrix, then predict the 0/1 label mask for the test split.
ovr.fit(X_train_dtm, y_train_mlb)
y_pred_ovr_test = ovr.predict(X_test_dtm)
y_pred_ovr_test

In [None]:
# Predictions on the training split, used for the train scores.
y_pred_ovr_train = ovr.predict(X_train_dtm)
y_pred_ovr_train

# 9. Fit the classifier, make predictions and get the accuracy 


In [None]:
def print_scores(actual, predicted, averaging_type):
    """Print F1, precision and recall for one averaging mode.

    Parameters
    ----------
    actual : array-like
        True labels; the notebook passes the binarized label matrices.
    predicted : array-like
        Hard 0/1 predictions in the same layout as ``actual``.
    averaging_type : str
        Passed as ``average=`` to each metric; the notebook uses
        'micro', 'macro' and 'weighted'.

    Returns
    -------
    None
        The scores are printed, not returned.
    """
    print('\nAVERAGING TYPE==> ',averaging_type)
    print('F1 score: ',f1_score(actual,predicted, average=averaging_type))
    # Use precision_score, the counterpart of recall_score below. The original called
    # average_precision_score, which summarizes a precision-recall curve and expects
    # continuous scores (probabilities / decision values), not thresholded 0/1 predictions.
    print('Average Precision Score: ',metrics.precision_score(actual,predicted, average=averaging_type))
    print('Average Recall Score: ',recall_score(actual,predicted, average=averaging_type))

##### Train Score

In [None]:
print('--------------------------TRAIN SCORES--------------------------------')
# For multi-label indicator input, accuracy_score is subset accuracy: a row only counts
# as correct when every one of its labels matches.
print('Accuracy score: ',accuracy_score(y_train_mlb, y_pred_ovr_train))
for averaging in ('micro', 'macro', 'weighted'):
    print_scores(y_train_mlb, y_pred_ovr_train, averaging)

##### Test Scores

In [None]:
print('--------------------------TEST SCORES--------------------------------')
# For multi-label indicator input, accuracy_score is subset accuracy: a row only counts
# as correct when every one of its labels matches.
print('Accuracy score: ',accuracy_score(y_test_mlb, y_pred_ovr_test))
for averaging in ('micro', 'macro', 'weighted'):
    print_scores(y_test_mlb, y_pred_ovr_test, averaging)

# 10. Print true label and predicted label for any five examples

In [None]:
# First five test rows: predicted and true binarized label vectors.
five_pred = y_pred_ovr_test[:5]
five_actual = y_test_mlb[:5]

In [None]:
# Decode the true labels of the first five test rows directly from y_test_mlb.
# The original passed five_actual back into inverse_transform, so re-running the cell
# failed: by then the name held decoded label tuples instead of an indicator matrix.
five_actual = mlb.inverse_transform(y_test_mlb[:5])
five_actual

In [None]:
# Decode the predicted labels of the first five test rows directly from y_pred_ovr_test.
# The original passed five_pred back into inverse_transform, so re-running the cell
# failed: by then the name held decoded label tuples instead of an indicator matrix.
five_pred = mlb.inverse_transform(y_pred_ovr_test[:5])
five_pred