In [8]:
import os
import pandas as pd
import nltk
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
# NLTK English stop words, held as a set; handed as `stop_words` to the
# TfidfVectorizer instances built further down in this notebook.
stop_words = set(stopwords.words('english'))


## Loading the dataset

In [9]:
# Folder that holds the dataset (the CSV splits and the `images` directory).
DATA_PATH = '../../Data'

# One CSV file per split, all located directly under DATA_PATH.
csv_path_train, csv_path_val, csv_path_test = (
    os.path.join(DATA_PATH, csv_name)
    for csv_name in ('split_train.csv', 'split_val.csv', 'split_test.csv')
)

# Directory used as the prefix of every row's image path.
image_path = os.path.join(DATA_PATH, 'images')

# Column holding the text that is fed to the classifiers.
# (Alternative column name left commented out by the author: 'Text Normalized'.)
text_data_column = 'Text Transcription'

def read_data_from_csv(path_name):
    """Load one tab-separated split file and attach an image path to every row.

    Only the `file_name`, `misogynous` and `text_data_column` columns are read.
    The returned DataFrame carries an extra `image_path` column, built by
    prefixing each `file_name` with the module-level `image_path` folder and '/'.
    """
    wanted_columns = ['file_name', 'misogynous', text_data_column]
    frame = pd.read_csv(path_name, usecols=wanted_columns, sep='\t')

    image_dir_prefix = image_path + '/'
    frame['image_path'] = image_dir_prefix + frame['file_name']
    return frame

## Reading data: one DataFrame per split, loaded in train / val / test order
train_df, val_df, test_df = (
    read_data_from_csv(split_csv)
    for split_csv in (csv_path_train, csv_path_val, csv_path_test)
)

print(train_df.shape)

# Display names handed to `classification_report` as `target_names`.
target_names = ['non-misogynistic', 'misogynistic']

(7000, 4)


## Naive_Bayes

In [47]:
import nltk
from nltk.stem.snowball import SnowballStemmer


# Shared Snowball stemmer used by the analyzer below.
stemmer = SnowballStemmer("english", ignore_stopwords=True)


class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer whose analyzer passes every token through `stemmer.stem`."""

    def build_analyzer(self):
        """Return a callable that runs the parent analyzer and stems each token it yields."""
        base_analyzer = super().build_analyzer()

        def stemmed_analyzer(doc):
            return [stemmer.stem(token) for token in base_analyzer(doc)]

        return stemmed_analyzer


# Vectorizer instance reused as the 'vect' step of the stemmed pipelines below.
stemmed_count_vect = StemmedCountVectorizer(stop_words='english')

In [48]:
## Naive Bayes: token counts -> TF-IDF -> multinomial NB
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

text_clf = text_clf.fit(
    train_df[text_data_column],
    train_df['misogynous'],
)

# Accuracy on the test split = fraction of predictions equal to the labels.
predicted_nb = text_clf.predict(test_df[text_data_column])
score_nb = np.mean(predicted_nb == test_df['misogynous'])
print(score_nb)

0.815


In [49]:
# Per-class precision / recall / F1 of the Naive Bayes test predictions.
print(classification_report(
    test_df['misogynous'],
    predicted_nb,
    digits=4,
    target_names=target_names,
))

                  precision    recall  f1-score   support

non-misogynistic     0.8307    0.8066    0.8184       517
    misogynistic     0.7992    0.8240    0.8114       483

       micro avg     0.8150    0.8150    0.8150      1000
       macro avg     0.8149    0.8153    0.8149      1000
    weighted avg     0.8155    0.8150    0.8151      1000



In [50]:
## Naive Bayes with Stemming: stemmed token counts -> TF-IDF -> multinomial NB
text_clf = Pipeline([
    ('vect', stemmed_count_vect),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

text_clf = text_clf.fit(
    train_df[text_data_column],
    train_df['misogynous'],
)

# Accuracy on the test split, followed by the per-class report.
predicted_nb = text_clf.predict(test_df[text_data_column])
score_nb = np.mean(predicted_nb == test_df['misogynous'])
print(score_nb)
print(classification_report(
    test_df['misogynous'],
    predicted_nb,
    digits=3,
    target_names=target_names,
))

0.79
                  precision    recall  f1-score   support

non-misogynistic      0.809     0.778     0.793       517
    misogynistic      0.771     0.803     0.787       483

       micro avg      0.790     0.790     0.790      1000
       macro avg      0.790     0.790     0.790      1000
    weighted avg      0.791     0.790     0.790      1000



In [53]:
## Analysis of the top words in each class
import numpy as np
def show_top10(classifier, vectorizer, categories, n=20):
    """Print, per category, the features with the highest log-probability.

    Parameters
    ----------
    classifier : object exposing `feature_log_prob_`, indexable by class index.
    vectorizer : fitted vectorizer exposing `get_feature_names_out()` (or the
        older `get_feature_names()`), giving the feature name of every column.
    categories : iterable of display labels; the i-th label is printed next to
        row i of `classifier.feature_log_prob_`.
    n : int, default 20
        Number of top features printed per category. The default keeps the
        historical behaviour of this helper (20 features despite the name).

    Features are printed in ascending order of log-probability, so the most
    probable feature comes last on each line.
    """
    # `get_feature_names` was removed from scikit-learn's vectorizers in favour
    # of `get_feature_names_out`; support both so old and new versions work.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = np.asarray(vectorizer.get_feature_names_out())
    else:
        feature_names = np.asarray(vectorizer.get_feature_names())

    for i, category in enumerate(categories):
        order = np.argsort(classifier.feature_log_prob_[i])
        # Explicit start index so that n=0 yields nothing (a plain [-0:] slice
        # would return every feature) and n > len(order) yields everything.
        top = order[max(len(order) - n, 0):]
        print("%s: %s" % (category, " ".join(feature_names[top])))

# Display labels for row 0 and row 1 of `feature_log_prob_`, in that order.
categories = ['non-misogynistic', 'misogynistic']
show_top10(
    classifier=text_clf.named_steps['clf'],
    vectorizer=text_clf.named_steps['vect'],
    categories=categories,
)

non-misogynistic: woman kitchen day work time women want cook meme cheat don clean peopl just like wife hous imgflip girlfriend com
misogynistic: memegener man want kitchen meme don memecent just look imgflip make quickmem men net feminist woman like girl women com


In [217]:
# Ten largest per-feature probabilities of class index 0
# (labelled 'non-misogynistic' in `categories`), in ascending order.
np.exp(np.sort(text_clf.named_steps['clf'].feature_log_prob_[0])[-10:])

array([0.00152367, 0.00161975, 0.00165073, 0.00165584, 0.00174221,
       0.00189004, 0.00236661, 0.00243541, 0.0025502 , 0.00326558])

In [222]:
# Final 'clf' step of the current `text_clf` pipeline.
classifier = text_clf.named_steps['clf']

In [225]:
# Sorted per-feature log-probability differences: class index 1 minus class index 0.
np.sort(np.subtract(classifier.feature_log_prob_[1], classifier.feature_log_prob_[0]))

array([-2.2752985 , -2.18216691, -2.06796397, ...,  2.6635885 ,
        2.70756679,  3.19959238])

In [218]:
# Ten largest per-feature probabilities of class index 1
# (labelled 'misogynistic' in `categories`), in ascending order.
np.exp(np.sort(text_clf.named_steps['clf'].feature_log_prob_[1])[-10:])

array([0.00208433, 0.00215492, 0.00220405, 0.00221154, 0.00254434,
       0.00283106, 0.00327792, 0.00388127, 0.00532711, 0.00668653])

## SVM

In [19]:
## SVM: token counts -> TF-IDF -> hinge-loss SGD classifier
text_clf_svm = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf-svm', SGDClassifier(
        loss='hinge',
        penalty='l2',
        alpha=1e-3,
        max_iter=5,
        random_state=42,
    )),
])

text_clf_svm = text_clf_svm.fit(
    train_df[text_data_column],
    train_df['misogynous'],
)

# Accuracy on the test split.
predicted_svm = text_clf_svm.predict(test_df[text_data_column])
score_svm = np.mean(predicted_svm == test_df['misogynous'])
print(score_svm)

0.789




In [20]:
# Per-class precision / recall / F1 of the SVM test predictions.
print(classification_report(
    test_df['misogynous'],
    predicted_svm,
    digits=4,
    target_names=target_names,
))

                  precision    recall  f1-score   support

non-misogynistic     0.7593    0.8665    0.8094       517
    misogynistic     0.8317    0.7060    0.7637       483

       micro avg     0.7890    0.7890    0.7890      1000
       macro avg     0.7955    0.7863    0.7866      1000
    weighted avg     0.7943    0.7890    0.7873      1000



In [21]:
## SVM with stemming: stemmed token counts -> TF-IDF -> hinge-loss SGD classifier
text_clf_svm = Pipeline([
    ('vect', stemmed_count_vect),
    ('tfidf', TfidfTransformer()),
    ('clf-svm', SGDClassifier(
        loss='hinge',
        penalty='l2',
        alpha=1e-3,
        max_iter=5,
        random_state=42,
    )),
])

text_clf_svm = text_clf_svm.fit(
    train_df[text_data_column],
    train_df['misogynous'],
)

# Accuracy on the test split, followed by the per-class report.
predicted_svm = text_clf_svm.predict(test_df[text_data_column])
score_svm = np.mean(predicted_svm == test_df['misogynous'])
print(score_svm)
print(classification_report(
    test_df['misogynous'],
    predicted_svm,
    digits=3,
    target_names=target_names,
))

0.792
                  precision    recall  f1-score   support

non-misogynistic      0.750     0.897     0.817       517
    misogynistic      0.861     0.679     0.759       483

       micro avg      0.792     0.792     0.792      1000
       macro avg      0.805     0.788     0.788      1000
    weighted avg      0.803     0.792     0.789      1000





## Logistic Regression

In [22]:

# TF-IDF (NLTK stop words removed) -> logistic regression.
LogReg_pipeline = Pipeline([
                # Recent scikit-learn releases validate `stop_words` and accept only
                # 'english', a list or None, so the stop-word set is passed as a list.
                ('tfidf', TfidfVectorizer(stop_words=sorted(stop_words))),
                # 'sag' is a stochastic solver; seed it so re-runs give the same model
                # (same seed as the SGD classifiers in this notebook).
                ('clf', LogisticRegression(solver='sag', random_state=42))
            ])

lr_clf = LogReg_pipeline.fit(train_df[text_data_column], train_df['misogynous'])
pred_lr = lr_clf.predict(test_df[text_data_column])

# Per-class report and overall accuracy on the test split.
print(classification_report(test_df['misogynous'], pred_lr, target_names=target_names, digits=4))
acc_score = accuracy_score(test_df['misogynous'], pred_lr)
print("Accuracy = {}".format(acc_score))

                  precision    recall  f1-score   support

non-misogynistic     0.7981    0.8259    0.8118       517
    misogynistic     0.8065    0.7764    0.7911       483

       micro avg     0.8020    0.8020    0.8020      1000
       macro avg     0.8023    0.8012    0.8015      1000
    weighted avg     0.8021    0.8020    0.8018      1000

Accuracy = 0.802


In [23]:
#### LOGREG WITH STEMMING: stemmed token counts -> TF-IDF -> logistic regression
LogReg_pipeline = Pipeline([('vect', stemmed_count_vect),
                ('tfidf', TfidfTransformer()),
                # 'sag' is a stochastic solver; seed it so re-runs give the same model
                # (same seed as the SGD classifiers in this notebook).
                ('clf', LogisticRegression(solver='sag', random_state=42))
            ])

lr_clf = LogReg_pipeline.fit(train_df[text_data_column], train_df['misogynous'])
pred_lr = lr_clf.predict(test_df[text_data_column])

# Per-class report and overall accuracy on the test split.
print(classification_report(test_df['misogynous'], pred_lr, target_names=target_names, digits=3))
acc_score = accuracy_score(test_df['misogynous'], pred_lr)
print("Accuracy = {}".format(acc_score))

                  precision    recall  f1-score   support

non-misogynistic      0.801     0.834     0.817       517
    misogynistic      0.814     0.778     0.796       483

       micro avg      0.807     0.807     0.807      1000
       macro avg      0.807     0.806     0.806      1000
    weighted avg      0.807     0.807     0.807      1000

Accuracy = 0.807


## Decision Tree with Stemming

In [24]:
#### Decision Tree WITH STEMMING
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

# Stemmed token counts -> TF-IDF -> decision tree.
DT_pipeline = Pipeline([('vect', stemmed_count_vect),
                ('tfidf', TfidfTransformer()),
                # Seeded so that re-running the notebook rebuilds the same tree
                # (same seed as the SGD classifiers in this notebook).
                ('clf', DecisionTreeClassifier(random_state=42))
            ])

dt_clf = DT_pipeline.fit(train_df[text_data_column], train_df['misogynous'])

# Stored under its own name: writing the tree predictions into `pred_lr`
# silently replaced the logistic-regression predictions.
pred_dt = dt_clf.predict(test_df[text_data_column])

# Per-class report and overall accuracy on the test split.
print(classification_report(test_df['misogynous'], pred_dt, target_names=target_names, digits=3))
acc_score = accuracy_score(test_df['misogynous'], pred_dt)
print("Accuracy = {}".format(acc_score))

                  precision    recall  f1-score   support

non-misogynistic      0.746     0.660     0.700       517
    misogynistic      0.676     0.760     0.715       483

       micro avg      0.708     0.708     0.708      1000
       macro avg      0.711     0.710     0.708      1000
    weighted avg      0.712     0.708     0.708      1000

Accuracy = 0.708


In [32]:
# NLTK
# Removing stop words
from sklearn.pipeline import Pipeline
# NOTE(review): this pipeline is only constructed here; nothing in this cell fits
# or uses it, and the assignment rebinds the `text_clf` name used by earlier cells.
text_clf = Pipeline([
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

# Stemming Code

import nltk
# nltk.download()

from nltk.stem.snowball import SnowballStemmer
# Re-declaration of the stemmer, vectorizer class and vectorizer instance that
# the Naive_Bayes section already defines; running this cell rebinds all three.
stemmer = SnowballStemmer("english", ignore_stopwords=True)


class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer whose analyzer passes every token through `stemmer.stem`."""

    def build_analyzer(self):
        """Return a callable that runs the parent analyzer and stems each token it yields."""
        base_analyzer = super().build_analyzer()

        def stemmed_analyzer(doc):
            return [stemmer.stem(token) for token in base_analyzer(doc)]

        return stemmed_analyzer


stemmed_count_vect = StemmedCountVectorizer(stop_words='english')

# Stemmed token counts -> TF-IDF -> multinomial NB built with fit_prior=False.
text_mnb_stemmed = Pipeline([
    ('vect', stemmed_count_vect),
    ('tfidf', TfidfTransformer()),
    ('mnb', MultinomialNB(fit_prior=False)),
])

text_mnb_stemmed = text_mnb_stemmed.fit(
    train_df[text_data_column],
    train_df['misogynous'],
)

predicted_mnb_stemmed = text_mnb_stemmed.predict(test_df[text_data_column])

# Accuracy on the test split (displayed as the cell's result).
np.mean(predicted_mnb_stemmed == test_df['misogynous'])

0.79

## Storing TF features (stemmed, `use_idf=False`) in a pickle file

In [38]:
## Naive Bayes
# text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])
import nltk
from nltk.stem.snowball import SnowballStemmer


# Re-declaration of the stemmer, vectorizer class and vectorizer instance that
# earlier cells already define; running this cell rebinds all three.
stemmer = SnowballStemmer("english", ignore_stopwords=True)


class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer whose analyzer passes every token through `stemmer.stem`."""

    def build_analyzer(self):
        """Return a callable that runs the parent analyzer and stems each token it yields."""
        base_analyzer = super().build_analyzer()

        def stemmed_analyzer(doc):
            return [stemmer.stem(token) for token in base_analyzer(doc)]

        return stemmed_analyzer


stemmed_count_vect = StemmedCountVectorizer(stop_words='english')

# Stemmed token counts -> term-frequency weighting (use_idf=False) -> multinomial NB.
text_mnb_stemmed = Pipeline([('vect', stemmed_count_vect),
                     ('tfidf', TfidfTransformer(use_idf=False)),
                     ('mnb', MultinomialNB()),
 ])

# Raw text of every split. The validation and test series used to exist only in
# commented-out lines, so this cell raised NameError on a fresh kernel unless
# they happened to be left over from an earlier session.
X_train_data = train_df[text_data_column]
X_val_data = val_df[text_data_column]
X_test_data = test_df[text_data_column]

clf = text_mnb_stemmed.fit(X_train_data, train_df['misogynous'])

# Stand-alone TF-IDF vectorizer (no stemming), fitted on the training text only.
# Recent scikit-learn releases validate `stop_words` and accept only 'english',
# a list or None, so the stop-word set is passed as a list.
tfidf_transformer = TfidfVectorizer(use_idf=True, stop_words=sorted(stop_words))
X_train_tfidf = tfidf_transformer.fit_transform(X_train_data)
print(X_train_tfidf.shape)

# Project every split into the vocabulary learnt from the training text.
X_test = tfidf_transformer.transform(X_test_data)
X_val = tfidf_transformer.transform(X_val_data)
X_train = tfidf_transformer.transform(X_train_data)

print(X_test.shape)
print(X_train.shape)
print(X_val.shape)

# Accuracy of the stemmed Naive Bayes pipeline on the test, validation and
# training splits, printed in that order. The pipeline consumes raw text, so it
# is fed the text series rather than the TF-IDF matrices computed above.
for split_text, split_df in (
    (X_test_data, test_df),
    (X_val_data, val_df),
    (X_train_data, train_df),
):
    preds = clf.predict(split_text)
    score_nb = np.mean(preds == split_df['misogynous'])
    print(score_nb)


(7000, 15993)
(1000, 15993)
(7000, 15993)
(2000, 15993)
0.805
0.7605
0.8735714285714286


In [40]:
# Feature matrices produced by the fitted pipeline's own 'vect' and 'tfidf'
# steps; these replace the X_train / X_val / X_test computed in the previous cell.
X_train, X_val, X_test = (
    clf.named_steps['tfidf'].transform(clf.named_steps['vect'].transform(split_text))
    for split_text in (X_train_data, X_val_data, X_test_data)
)

for split_matrix in (X_test, X_train, X_val):
    print(split_matrix.shape)

(1000, 12804)
(7000, 12804)
(2000, 12804)


In [41]:
# Shape of the densified test-split feature matrix.
np.asarray(X_test.todense()).shape

(1000, 12804)

In [42]:
import pickle

# Dense feature matrix of every split, keyed by split name.
tf_idf = {
    'train': np.array(X_train.todense()),
    'val': np.array(X_val.todense()),
    'test': np.array(X_test.todense()),
}

# Persist all three matrices in a single pickle file in the working directory.
with open('tf_features.pickle', 'wb') as out_file:
    pickle.dump(tf_idf, out_file, protocol=pickle.HIGHEST_PROTOCOL)