In [1]:
import os
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from IPython.display import Markdown, display
def printmd(string):
    display(Markdown(string))
#printmd('**bold**')

In [3]:
# skip this field if "combined_set.csv" exists
parsed_csv = 'combined_set.csv'

if not os.path.isfile(parsed_csv):
    feeds = pd.read_json('./feeds_2000.ndjson', lines=True)
    labels = pd.read_json('./labels_2000.ndjson', lines=True)

    # not all IDs are found in the tiny labels file, make sure we have a complete (albeit fake, dataset)
    import random
    valid_ids = list(feeds.id)  # the ids found in the feeds_2000
    random.shuffle(valid_ids)  # shuffle these and assign random IDs that exist 
    labels.id = valid_ids
    combined = pd.merge(feeds, labels, on='id')
    expanded = combined.set_index(
        ['id', 'birthyear', 'fame', 'gender', 'occupation']
    )['text'].apply(pd.Series).stack()
    expanded = expanded.reset_index()
    expanded = expanded.drop(columns=['level_5'])  # level_5 is the auto-generated new column, containing an index

    expanded.to_csv(parsed_csv)
else:
    print('file exists, continue!')

file exists, continue!


In [4]:
data_path = os.path.join(os.getcwd(), parsed_csv)

In [5]:
#data_raw = pd.read_csv(data_path)
data_raw = pd.read_csv(open(data_path,'r'), encoding='utf-8', engine='c')
data_raw.columns = ['index', 'id', 'birthyear', 'fame', 'gender', 'occupation', 'text']
data_raw = data_raw.drop(columns=['index', 'id'])  # no need for any non-label data

print(data_raw.shape)  # before dropping NaN values

data_raw = data_raw.dropna()
data_raw['birthyear'] = data_raw['birthyear'].astype(int)  # from 1978.0 -> 1978
print(data_raw.shape)


(5220005, 5)
(5219253, 5)


In [6]:
print("Number of rows in data =",data_raw.shape[0])
print("Number of columns in data =",data_raw.shape[1])
print("\n")
printmd("**Sample data:**")
data_raw.head(30)

Number of rows in data = 5219253
Number of columns in data = 5




**Sample data:**

Unnamed: 0,birthyear,fame,gender,occupation,text
0,1991,star,male,performer,Back at it with @americanidol looking for...he...
1,1991,star,male,performer,"Can’t buy all the happiness in the world, it’s..."
2,1991,star,male,performer,30 down @nytimes 🤝🧡
3,1991,star,male,performer,"📸 @ronyalwin 💘 @ New York, New York https://t...."
4,1991,star,male,performer,🎀 pink it was love at first sight 🎀 @ New York...
5,1991,star,male,performer,Putting my best foot forward in The Memphis by...
6,1991,star,male,performer,Girls UNITED can never be divided! 👯‍♀️❤️ http...
7,1991,star,male,performer,BRB buying The Stephanie bow shoe as an early ...
8,1991,star,male,performer,"Connect the dots, bbs #TheClara - TeamKP @kpc..."
9,1991,star,male,performer,Geometry was The Daina’s favorite subject in s...


In [7]:
categories = list(data_raw.columns.values)
categories = categories[:-1]
print(categories)

['birthyear', 'fame', 'gender', 'occupation']


## 2. Data Pre-Processing

In [8]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import re

import sys
import warnings

if not sys.warnoptions:
    warnings.simplefilter("ignore")

In [26]:
data = data_raw
# uncomment below to keep a subset of rows for testing
numrows = 1000000
data = data_raw.loc[np.random.choice(data_raw.index, size=numrows)]
data.shape

(1000000, 5)

### 2.1. Cleaning Data

In [27]:
def cleanHtml(sentence):
    #cleanr = re.compile('<.*?>')
    #cleantext = re.sub(cleanr, ' ', str(sentence))
    cleantext = re.sub(r'http\S+', '', sentence)
    return cleantext

def cleanRetweet(sentence):
    return re.sub(r'rt', '', sentence)

def cleanPunc(sentence): #function to clean the word of any punctuation or special characters
    cleaned = re.sub(r'[?|!|\'|"|]',r'',sentence)
    cleaned = re.sub(r'[.|,|)|(|\|/]',r' ',cleaned)
    cleaned = cleaned.strip()
    cleaned = cleaned.replace("\n"," ")
    return cleaned

def cleanMentions(sentence):
    return re.sub(r'@#?\b\w\w+\b', '', sentence)

def keepAlphaHash(sentence):
    return ' '.join([w for w in sentence.split() if w.isalpha() or '#' in w])

def emptyToNan(sentence):
    if len(sentence) < 1:
        return np.nan
    else:
        return sentence


In [28]:
data['text'] = data['text'].str.lower()
data['text'] = data['text'].apply(cleanRetweet)
data['text'] = data['text'].apply(cleanHtml)
data['text'] = data['text'].apply(cleanPunc)
data['text'] = data['text'].apply(keepAlphaHash)

# prune empty sentences, replace with NaN and use the built-in dropna() func
data['text'] = data['text'].apply(emptyToNan)
data = data.dropna()
data.head(30)

Unnamed: 0,birthyear,fame,gender,occupation,text
3321958,1971,star,male,sports,i know there is no momentum in a baseball seri...
595074,1973,star,female,performer,#repost umarım içine sindiği kadar yolu da açı...
1679436,1952,star,male,creator,wow farewell to the obamas struck me as farewe...
718310,1991,star,male,performer,thug life #throwbackthursday
3781527,1986,star,male,sports,musiga commends president mahama for setting u...
3247362,1982,star,male,sports,i care what do we have the best celebrations i...
5166763,1974,superstar,male,performer,literally in my hotel room alone making myself...
280593,1949,star,male,science,ready
4304993,1983,star,male,sports,copenhagen i am in you playing tonight at pump...
4445853,1997,star,male,sports,animalstilo


### 2.2. Removing Stop Words

In [29]:
stop_words = set(stopwords.words('english'))
stop_words.update(['zero','one','two','three','four','five','six','seven','eight','nine','ten','may','also','across','among','beside','however','yet','within'])
re_stop_words = re.compile(r"\b(" + "|".join(stop_words) + ")\\W", re.I)
def removeStopWords(sentence):
    global re_stop_words
    return re_stop_words.sub(" ", sentence)

data['text'] = data['text'].apply(removeStopWords)
data.head(30)

Unnamed: 0,birthyear,fame,gender,occupation,text
3321958,1971,star,male,sports,know momentum baseball series boy ex...
595074,1973,star,female,performer,#repost umarım içine sindiği kadar yolu da açı...
1679436,1952,star,male,creator,wow farewell obamas struck farewell we...
718310,1991,star,male,performer,thug life #throwbackthursday
3781527,1986,star,male,sports,musiga commends president mahama setting tou...
3247362,1982,star,male,sports,care best celebrations league
5166763,1974,superstar,male,performer,literally hotel room alone making laugh ...
280593,1949,star,male,science,ready
4304993,1983,star,male,sports,copenhagen playing tonight pumpehuset ti...
4445853,1997,star,male,sports,animalstilo


### 2.3. Stemming

In [None]:
stemmer = SnowballStemmer("english")
def stemming(sentence):
    stemSentence = ""
    for word in sentence.split():
        stem = stemmer.stem(word)
        stemSentence += stem
        stemSentence += " "
    stemSentence = stemSentence.strip()
    return stemSentence

data['text'] = data['text'].apply(stemming)
data.head()

In [None]:
data.to_csv('PREPROCESSED.csv')

In [None]:
data = pd.read_csv('PREPROCESSED.csv')

In [None]:
data = data.dropna()

### 2.4. Train-Test Split

In [None]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(data, random_state=42, test_size=0.20, shuffle=True)

#train.to_csv('formatted/train.csv')
print(train.shape)
print(test.shape)

In [None]:
train_text = train['text']
test_text = test['text']

### 2.5. TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
#x = v.fit_transform(df['Review'].values.astype('U'))  ## Even astype(str) would work
vectorizer = TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,3), norm='l2')
vectorizer.fit(train_text)
vectorizer.fit(test_text)

In [None]:
x_train = vectorizer.transform(train_text)
y_train = train.drop(labels = ['text'], axis=1)

x_test = vectorizer.transform(test_text)
y_test = test.drop(labels = ['text'], axis=1)

## 3. Multi-Label Classification

### 3.1. Multiple Binary Classifications - (One Vs Rest Classifier)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier

In [None]:
categories = list(data.columns.values)
categories = categories[:-1]
print(categories)

In [None]:
%%time
# Using pipeline for applying logistic regression and one vs rest classifier
LogReg_pipeline = Pipeline([
                ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'), n_jobs=-1)),
            ])

for category in categories:
    printmd('**Processing {} comments...**'.format(category))
    
    # Training logistic regression model on train data
    LogReg_pipeline.fit(x_train, train[category])
    
    # calculating test accuracy
    prediction = LogReg_pipeline.predict(x_test)
    print('Test accuracy is {}'.format(accuracy_score(test[category], prediction)))
    print("\n")

### 3.2. Multiple Binary Classifications - (Binary Relevance)

In [25]:
%%time

# using binary relevance
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import GaussianNB

# initialize binary relevance multi-label classifier
# with a gaussian naive bayes base classifier
classifier = BinaryRelevance(GaussianNB())

# train
classifier.fit(x_train, y_train)

# predict
predictions = classifier.predict(x_test)

# accuracy
print("Accuracy = ",accuracy_score(y_test,predictions))
print("\n")

TypeError: no supported conversion for types: (dtype('O'),)

### 3.3. Classifier Chains

In [None]:
# using classifier chains
from skmultilearn.problem_transform import ClassifierChain
from sklearn.linear_model import LogisticRegression

In [None]:
%%time

# initialize classifier chains multi-label classifier
classifier = ClassifierChain(LogisticRegression())

# Training logistic regression model on train data
classifier.fit(x_train, y_train)

# predict
predictions = classifier.predict(x_test)

# accuracy
print("Accuracy = ",accuracy_score(y_test,predictions))
print("\n")

### 3.4. Label Powerset

In [None]:
# using Label Powerset
from skmultilearn.problem_transform import LabelPowerset

In [None]:
%%time

# initialize label powerset multi-label classifier
classifier = LabelPowerset(LogisticRegression())

# train
classifier.fit(x_train, y_train)

# predict
predictions = classifier.predict(x_test)

# accuracy
print("Accuracy = ",accuracy_score(y_test,predictions))
print("\n")