In [None]:
import pandas as pd

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset paths used below live under it.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the labelled splits: headerless, tab-separated files on the mounted Drive.
df_train, df_test = (
    pd.read_csv(ruta, sep="\t", header=None)
    for ruta in (
        '/content/drive/MyDrive/TP3-Orga/Training.csv',
        '/content/drive/MyDrive/TP3-Orga/Test.csv',
    )
)

In [None]:
# Both splits share the same two-column layout: the raw text and its class label.
df_train.columns = df_test.columns = ["text", "target"]

In [None]:
# Features: everything except the label column (kept as one-column DataFrames).
X_train, X_test = (df.drop(columns="target") for df in (df_train, df_test))

# Labels: the class column of each split.
y_train, y_test = df_train["target"], df_test["target"]

In [None]:
# Binarise the labels: 1 for 'Discrimination', 0 for anything else.
# They are derived from the untouched df_* frames rather than from y_* itself,
# so re-running this cell is idempotent (mapping the already-binarised y_* a
# second time would turn every label into 0).
y_train = (df_train["target"] == 'Discrimination').astype("int64")
y_test = (df_test["target"] == 'Discrimination').astype("int64")

In [None]:
from nltk import word_tokenize
import nltk
# word_tokenize relies on the Punkt models. Recent NLTK releases (3.9+) look
# them up as 'punkt_tab' instead of the pickled 'punkt' package, so fetch both
# to keep the tokenisation cells working across NLTK versions.
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Lower-case every training text and split it into English word tokens.
X_train_token = X_train["text"].map(
    lambda texto: word_tokenize(texto.lower(), language='english')
)

In [None]:
# Same tokenisation as the training split: lower-case, then English word tokens.
X_test_token = X_test["text"].map(
    lambda texto: word_tokenize(texto.lower(), language='english')
)

In [None]:
import nltk
# Fetch the stop-word corpus before it is read below.
nltk.download('stopwords')
from nltk.corpus import stopwords
# English stop words as a set, so the membership test in remove_stopwords is cheap.
stopwordsEn = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
def remove_stopwords(texto, stopwords):
  """Return the items of ``texto`` that are not in ``stopwords``.

  Args:
    texto: iterable of tokens (the notebook passes a list of strings).
    stopwords: container supporting ``in``; tokens found in it are dropped.

  Returns:
    A new list with the surviving tokens, in their original order.
  """
  conservadas = []
  for token in texto:
    if token in stopwords:
      continue
    conservadas.append(token)
  return conservadas

In [None]:
# Drop English stop words from every token list (train and test alike).
X_train_token = X_train_token.map(
    lambda tokens: remove_stopwords(tokens, stopwordsEn)
)
X_test_token = X_test_token.map(
    lambda tokens: remove_stopwords(tokens, stopwordsEn)
)

In [None]:
# Glue each token list back into one space-separated string for the vectoriser.
X_train_token = X_train_token.map(' '.join)
X_test_token = X_test_token.map(' '.join)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# TF-IDF vectoriser with library defaults; reused later for the weakly-labelled texts.
TfIdf = TfidfVectorizer()

In [None]:
# Fit the vectoriser on the training texts only, then encode the test texts
# with that same fitted vectoriser (no refit on test data).
X_train_vec = TfIdf.fit_transform(X_train_token)
X_test_vec = TfIdf.transform(X_test_token)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

In [None]:
from sklearn.metrics import roc_auc_score, accuracy_score

In [None]:
# Base logistic-regression estimator for the grid search; seed fixed for repeatability.
lr = LogisticRegression(random_state=10)

In [None]:
# Joint hyperparameter grid for logistic regression: every C is cross-validated
# together with every max_iter (4 x 4 = 16 candidates).
# A *list* of single-key dicts is searched by GridSearchCV as independent
# sub-grids, i.e. each hyperparameter is varied alone while the other stays at
# its default, so combinations of C and max_iter were never evaluated.
params = {
    "C": [0.25, 0.5, 1, 1.25],
    "max_iter": [25, 50, 100, 125],
}

In [None]:
# 5-fold cross-validated search over `params`, ranking candidates by ROC AUC.
gs = GridSearchCV(
    estimator=lr,
    param_grid=params,
    scoring='roc_auc',
    cv=5,
)

In [None]:
# Run the logistic-regression search on the TF-IDF training matrix.
gs.fit(X_train_vec, y_train)

In [None]:
# Show the best-scoring hyperparameters found by the logistic-regression search.
gs.best_params_

{'C': 1.25}

In [None]:
# Keep a handle on the winning logistic-regression model (`gs` is reassigned later for the forest).
lr_clf = gs.best_estimator_

In [None]:
# Fit the selected model on the full training matrix.
# NOTE(review): probably redundant — the search is built without a `refit`
# argument, and GridSearchCV's documented default refits best_estimator_ on
# the whole training set already. Confirm before removing.
lr_clf.fit(X_train_vec, y_train)

In [None]:
# Test-set ROC AUC of the logistic regression, scored on column 1 of predict_proba.
roc_auc_score(y_test, lr_clf.predict_proba(X_test_vec)[:, 1])

0.9395061728395062

In [None]:
# Test-set accuracy of the logistic regression's hard predictions.
accuracy_score(y_test, lr_clf.predict(X_test_vec))

0.8833333333333333

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Base random-forest estimator for the grid search; seed fixed for repeatability.
rf = RandomForestClassifier(random_state=10)

In [None]:
# Random-forest search space, expressed as one single-key grid per hyperparameter.
# NOTE(review): GridSearchCV searches each dict in a list as its own grid, so
# every hyperparameter below is varied alone with the rest left at the
# estimator's defaults; combinations are never tried. A joint grid over these
# values would be roughly two thousand candidates, so consider a randomized
# search if combinations are wanted.
params = [
    {nombre: valores}
    for nombre, valores in {
        'n_estimators': [40, 60, 80, 100, 120, 140],
        'max_depth': [5, 10, 15, 20],
        "min_samples_split": [2, 5, 8, 10],
        "min_samples_leaf": [1, 2, 4],
        "max_features": ["sqrt", "log2", None],
        "bootstrap": [True, False],
        "oob_score": [True, False],
        "random_state": [10],
    }.items()
]

In [None]:
# 5-fold cross-validated search over the forest's `params`, ranked by ROC AUC.
# This rebinds `gs`; the logistic-regression search object is no longer reachable through it.
gs = GridSearchCV(
    estimator=rf,
    param_grid=params,
    scoring='roc_auc',
    cv=5,
)

In [None]:
# Run the random-forest search on the TF-IDF training matrix.
gs.fit(X_train_vec, y_train)

In [None]:
# Show the best-scoring hyperparameters found by the random-forest search.
gs.best_params_

{'max_features': 'log2'}

In [None]:
# Keep a handle on the winning random-forest model.
rf_clf = gs.best_estimator_

In [None]:
# Fit the selected forest on the full training matrix.
# NOTE(review): probably redundant — the search is built without a `refit`
# argument, and GridSearchCV's documented default refits best_estimator_ on
# the whole training set already. Confirm before removing.
rf_clf.fit(X_train_vec, y_train)

In [None]:
# Test-set ROC AUC of the random forest, scored on column 1 of predict_proba.
roc_auc_score(y_test, rf_clf.predict_proba(X_test_vec)[:, 1])

0.9419753086419753

In [None]:
# Test-set accuracy of the random forest's hard predictions.
accuracy_score(y_test, rf_clf.predict(X_test_vec))

0.8222222222222222

# Weakly labelled data

In [None]:
# Load the two weakly-labelled files (one text per row, no header) into
# single-column frames; the file each row comes from is its weak label.
df_weakly_disc, df_weakly_non_disc = (
    pd.read_csv(ruta, header=None, sep="\t", names=['text'])
    for ruta in (
        '/content/drive/MyDrive/TP3-Orga/weakly-discrimination.txt',
        '/content/drive/MyDrive/TP3-Orga/weakly-non-discrimination.txt',
    )
)

In [None]:
# Weak label for every row of the "discrimination" file: positive class (1).
df_weakly_disc['target'] = 1

In [None]:
# Weak label for every row of the "non-discrimination" file: negative class (0).
df_weakly_non_disc['target'] = 0

In [None]:
# Stack positives on top of negatives and renumber the rows 0..n-1.
df_weakly = pd.concat(
    objs=(df_weakly_disc, df_weakly_non_disc),
    axis=0,
    ignore_index=True,
)

In [None]:
# Split the weak set into its text column (as a DataFrame) and its weak labels.
X_weakly, y_weakly = df_weakly.drop(columns="target"), df_weakly["target"]

In [None]:
# Same preprocessing as the labelled splits: lower-case + tokenise, drop stop
# words, re-join into a string.
X_weakly_token = (
    X_weakly["text"]
    .map(lambda texto: word_tokenize(texto.lower(), language='english'))
    .map(lambda tokens: remove_stopwords(tokens, stopwordsEn))
    .map(' '.join)
)
# Encode with the vectoriser already fitted on the training texts (no refit).
X_weakly_vec = TfIdf.transform(X_weakly_token)

In [None]:
# Logistic-regression vote (0/1) for every weakly-labelled text.
preds_lr = lr_clf.predict(X_weakly_vec)

In [None]:
# Random-forest vote (0/1) for every weakly-labelled text.
preds_rf = rf_clf.predict(X_weakly_vec)

In [None]:
# Peek at the logistic-regression votes.
preds_lr

array([1, 1, 1, ..., 1, 1, 0])

In [None]:
# Peek at the random-forest votes.
preds_rf

array([1, 1, 1, ..., 1, 1, 0])

In [None]:
# Peek at the weak labels (1 for the discrimination file, 0 for the other).
y_weakly

0        1
1        1
2        1
3        1
4        1
        ..
11436    0
11437    0
11438    0
11439    0
11440    0
Name: target, Length: 11441, dtype: int64

In [None]:
def definir_clasificacion(textos, weakly_clf, lr_clf, rf_clf):
  """Majority vote of three binary (0/1) label sources, one result per text.

  The votes are matched by position, not by index label, so a pandas Series
  whose index is not 0..n-1 (e.g. after filtering or shuffling) is still
  combined row by row instead of raising KeyError or picking the wrong row.

  Args:
    textos: sized collection of the texts being labelled; only its length is used.
    weakly_clf: weak labels (0/1), one per text.
    lr_clf: first classifier's predictions (0/1), one per text.
    rf_clf: second classifier's predictions (0/1), one per text.

  Returns:
    A list of ints with ``len(textos)`` items: 1 where at least two of the
    three sources voted 1, otherwise 0.

  Raises:
    ValueError: if any vote sequence differs in length from ``textos``.
  """
  esperado = len(textos)
  votantes = {"weakly_clf": weakly_clf, "lr_clf": lr_clf, "rf_clf": rf_clf}
  for nombre, votos in votantes.items():
    if len(votos) != esperado:
      raise ValueError(
          f"{nombre} has {len(votos)} votes but textos has {esperado} rows"
      )

  # Iterating a Series/ndarray/list yields its values in positional order.
  # A sum of 2 or more means at least two sources voted 1.
  return [
      1 if (debil + lr + rf) >= 2 else 0
      for debil, lr, rf in zip(weakly_clf, lr_clf, rf_clf)
  ]

In [None]:
# Final label per weak text: majority vote of the weak label and the two model predictions.
targets_weakly = definir_clasificacion(X_weakly, y_weakly, preds_lr, preds_rf)

In [None]:
# Sanity check: number of voted labels produced.
len(targets_weakly)

11441

Algunos ejemplos (voto final, etiqueta débil, predicción LR, predicción RF):

In [None]:
# Row 10: (final vote, weak label, LR prediction, RF prediction).
targets_weakly[10], y_weakly[10], preds_lr[10], preds_rf[10]

(1, 1, 1, 1)

In [None]:
# Row 100: (final vote, weak label, LR prediction, RF prediction).
targets_weakly[100], y_weakly[100], preds_lr[100], preds_rf[100]

(1, 1, 0, 1)

In [None]:
# Row 10000: (final vote, weak label, LR prediction, RF prediction).
targets_weakly[10000], y_weakly[10000], preds_lr[10000], preds_rf[10000]

(0, 0, 0, 0)

In [None]:
# Put the texts and their majority-vote labels side by side (matched on the row index).
df_weakly_classified = pd.concat(
    objs=[X_weakly, pd.Series(targets_weakly)],
    axis="columns",
)

In [None]:
# Name the two columns to match the labelled splits' layout.
df_weakly_classified.columns = ['text', 'target']

In [None]:
# Inspect the relabelled weak set.
df_weakly_classified

Unnamed: 0,text,target
0,When white students complain about Affirmative...,1
1,RT @ClaytonCrook: When the univ cancels class ...,1
2,#BlackOnCampus feeling the need to tone down m...,1
3,Are you sure you want to wear your hair that w...,1
4,"#BlackOnCampus Terrorism being called ""bully...",1
...,...,...
11436,"Discrimination in any way, bully, only has pow...",1
11437,.@JeremyPatzer Public broadcaster needs to con...,0
11438,"UChicago is full of SHIT. Like, they are alrea...",1
11439,@SunSport @Tyson_Fury this proves that trave...,1


In [None]:
# Persist the relabelled weak set to Drive.
# NOTE(review): index=False is not passed, so by pandas' default the row index
# is written too, as an unnamed first column — confirm downstream readers expect it.
df_weakly_classified.to_csv('/content/drive/MyDrive/TP3-Orga/weakly_train.csv')

In [None]:
# Inspect the original training split (labels are still strings here).
df_train

Unnamed: 0,text,target
0,"#BlackOnCampus never being called on in class,...",Discrimination
1,First invite 2 the white parties and realize y...,Discrimination
2,"being labeled as the ""token black guy"" in arch...",Discrimination
3,male teacher comments 'that's quite a hefty bo...,Discrimination
4,I only got a scholarship to college because I'...,Discrimination
...,...,...
415,In my experience grit and resilience can be ta...,Non-Discrimination
416,Raymond Shrimp Boy Chow guilty 162 counts incl...,Non-Discrimination
417,#news by #almalki: VW proposes catalytic conve...,Non-Discrimination
418,Crews rescue worker trapped hole street collap...,Non-Discrimination


In [None]:
# Inspect the original test split (labels are still strings here).
df_test

Unnamed: 0,text,target
0,"Yeah I dont take out the bins, cause its a boy...",Discrimination
1,"#BlackOnCampus, when you tryout for the swim t...",Discrimination
2,"We're black, nothing wrong with it.. We've alw...",Discrimination
3,#BlackOnCampus when your lawschool property pr...,Discrimination
4,Soccer is more fun when you explain it by trea...,Discrimination
...,...,...
175,That angel hearted man who trying to end homel...,Non-Discrimination
176,@jagohindustani_ Millions r supporting #Asaram...,Non-Discrimination
177,Rookie defender activated tonight's game vs CO...,Non-Discrimination
178,Houthis release video Saudi soldier held HOSTA...,Non-Discrimination


In [None]:
# Stack the training and test splits into one frame with a fresh 0..n-1 index.
# Without ignore_index the test rows keep their own 0-based labels, so index
# values repeat: label-based lookups become ambiguous and the index written
# to CSV no longer identifies a row.
df_test_labelled = pd.concat([df_train, df_test], ignore_index=True)

In [None]:
# Binarise the label column in place: 1 for 'Discrimination', 0 otherwise.
# Re-running this cell after the first pass turns every label into 0, because
# the already-converted integers never equal the string.
df_test_labelled['target'] = (
    df_test_labelled['target'] == 'Discrimination'
).astype('int64')

In [None]:
# Inspect the combined, binarised frame (training rows followed by test rows).
df_test_labelled

Unnamed: 0,text,target
0,"#BlackOnCampus never being called on in class,...",1
1,First invite 2 the white parties and realize y...,1
2,"being labeled as the ""token black guy"" in arch...",1
3,male teacher comments 'that's quite a hefty bo...,1
4,I only got a scholarship to college because I'...,1
...,...,...
175,That angel hearted man who trying to end homel...,0
176,@jagohindustani_ Millions r supporting #Asaram...,0
177,Rookie defender activated tonight's game vs CO...,0
178,Houthis release video Saudi soldier held HOSTA...,0


In [None]:
# Persist the combined labelled frame (it holds both the training and test rows) to Drive.
# NOTE(review): index=False is not passed, so by pandas' default the row index
# is written too, as an unnamed first column — confirm downstream readers expect it.
df_test_labelled.to_csv('/content/drive/MyDrive/TP3-Orga/labelled_test.csv')