<a href="https://colab.research.google.com/github/mohmaed7777/Feedback-Prize---Predicting-Effective-Arguments/blob/main/Feedback_prize_logisticRegression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
! pip install kaggle

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
!mkdir ~/.kaggle

In [3]:
! cp kaggle.json ~/.kaggle/

In [4]:
! chmod 600 ~/.kaggle/kaggle.json

In [5]:
! kaggle competitions download -c feedback-prize-effectiveness

Downloading feedback-prize-effectiveness.zip to /content
  0% 0.00/8.13M [00:00<?, ?B/s]
100% 8.13M/8.13M [00:00<00:00, 164MB/s]


In [None]:
! unzip feedback-prize-effectiveness.zip

In [7]:
# importing step: 
import numpy as np 
import pandas  as pd 
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from scipy import sparse
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

In [8]:
NFOLDs = 15

In [9]:
train_df = pd.read_csv('/content/train.csv')
test_df = pd.read_csv('/content/test.csv')

In [10]:
train_df['text'] = train_df['essay_id'].apply(lambda x: open(f'/content/train/{x}.txt').read())
test_df['text'] = test_df['essay_id'].apply(lambda x: open(f'/content/test/{x}.txt').read())


In [11]:
train_df.tail()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness,text
36760,9f63b687e76a,FFA381E58FC6,For many people they don't like only asking on...,Claim,Adequate,Some people may ask multiple people for advice...
36761,9d5bd7d86212,FFA381E58FC6,also people have different views and opinions ...,Claim,Adequate,Some people may ask multiple people for advice...
36762,f1b78becd573,FFA381E58FC6,Advice is something that can impact a persons ...,Position,Adequate,Some people may ask multiple people for advice...
36763,cc184624ca8e,FFA381E58FC6,someone can use everything that many people sa...,Evidence,Ineffective,Some people may ask multiple people for advice...
36764,c8a973681feb,FFA381E58FC6,In conclusion asking for an opinion can be ben...,Concluding Statement,Ineffective,Some people may ask multiple people for advice...


In [12]:
effectiveness_map = {'Ineffective':0, 'Adequate':1, 'Effective':2}
train_df['target'] = train_df['discourse_effectiveness'].map(effectiveness_map)

In [13]:
train_df = train_df.reset_index(drop=True)

In [14]:
skfold = StratifiedKFold(n_splits=NFOLDs,shuffle=True,random_state=NFOLDs)
for i,(train_index, test_index) in enumerate(skfold.split(train_df, train_df["target"])):
    train_df.loc[test_index,"fold"] = i
print(train_df.fold.value_counts())   

0.0     2451
1.0     2451
3.0     2451
7.0     2451
11.0    2451
14.0    2451
5.0     2451
6.0     2451
2.0     2451
9.0     2451
12.0    2451
10.0    2451
8.0     2451
4.0     2451
13.0    2451
Name: fold, dtype: int64


In [15]:
preds = []

In [17]:
for n_fold in range(NFOLDs):
    dataset_tr_ = train_df[train_df['fold']!=n_fold] #use all the discourse_ids which are not marked by current fold index
    dataset_eval_ = train_df[train_df['fold']==n_fold] #use current fold index rows as validation set
         
    # Training, Validation, and Test Dataset
    #discourse_id
    tf = TfidfVectorizer(ngram_range=(1,2),norm='l2', smooth_idf=True)
    tr_discourse_tfidf = tf.fit_transform(dataset_tr_["discourse_text"])
    eval_discourse_tfidf = tf.transform(dataset_eval_["discourse_text"])
    te_discourse_tfidf = tf.transform(test_df["discourse_text"])
    
    #text
    tf = TfidfVectorizer(ngram_range=(1,2),norm='l2', smooth_idf=True) # Load tf another time because it will learn the new vocabulary for 'text'
    tr_text_tfidf = tf.fit_transform(dataset_tr_["text"])
    eval_text_tfidf = tf.transform(dataset_eval_["text"])
    te_text_tfidf = tf.transform(test_df["text"])
    
    #discourse_type
    ohe = OneHotEncoder()
    tr_type_ohe =  sparse.csr_matrix(ohe.fit_transform(dataset_tr_["discourse_type"].values.reshape(-1,1)))
    eval_type_ohe =  sparse.csr_matrix(ohe.transform(dataset_eval_["discourse_type"].values.reshape(-1,1)))
    te_type_ohe =  sparse.csr_matrix(ohe.transform(test_df["discourse_type"].values.reshape(-1,1)))
        
    #Stack each vector representations 
    tr_tfidf = sparse.hstack((tr_type_ohe,tr_discourse_tfidf,tr_text_tfidf))
    eval_tfidf = sparse.hstack((eval_type_ohe,eval_discourse_tfidf,eval_text_tfidf))
    te_tfidf = sparse.hstack((te_type_ohe,te_discourse_tfidf,te_text_tfidf))
    
    #Model
    clf = LogisticRegression(max_iter=1000,penalty="l2",C=1.0131816333513533)
    clf.fit(tr_tfidf, dataset_tr_["target"].values)
    
    #Validation 
    ev_preds = clf.predict_proba(eval_tfidf)
    ev_loss = log_loss(dataset_eval_["target"].values,ev_preds)
    print("Fold : {} EV score: {}".format(n_fold,ev_loss))
    
    #Test
    preds.append(clf.predict_proba(te_tfidf))

Fold : 0 EV score: 0.6324456976392218
Fold : 1 EV score: 0.6274001974649503
Fold : 2 EV score: 0.6104012263762614
Fold : 3 EV score: 0.6157701521744436
Fold : 4 EV score: 0.6259435880861658
Fold : 5 EV score: 0.61943120617009
Fold : 6 EV score: 0.6110717931098326
Fold : 7 EV score: 0.6275809715828793
Fold : 8 EV score: 0.6188842779129688
Fold : 9 EV score: 0.6211024340021408
Fold : 10 EV score: 0.616842590830928
Fold : 11 EV score: 0.6163751140753174
Fold : 12 EV score: 0.6365732691600133
Fold : 13 EV score: 0.6236448627481193
Fold : 14 EV score: 0.6226335324948262


In [18]:
test_predict = clf.predict_proba(te_tfidf)
test_predict

array([[0.01424892, 0.18212951, 0.80362157],
       [0.00817262, 0.7328416 , 0.25898578],
       [0.03190984, 0.38094717, 0.587143  ],
       [0.02651119, 0.31767898, 0.65580983],
       [0.01859565, 0.33328082, 0.64812353],
       [0.03962222, 0.14777993, 0.81259785],
       [0.02343766, 0.2300684 , 0.74649394],
       [0.02287981, 0.34273147, 0.63438872],
       [0.06395677, 0.15991566, 0.77612756],
       [0.0272131 , 0.2703913 , 0.70239561]])

In [19]:
sample = pd.read_csv('/content/sample_submission.csv')
sample.head()

Unnamed: 0,discourse_id,Ineffective,Adequate,Effective
0,a261b6e14276,0.2,0.6,0.4
1,5a88900e7dc1,3.0,6.0,1.0
2,9790d835736b,1.0,2.0,3.0
3,75ce6d68b67b,0.33,0.34,0.33
4,93578d946723,0.01,0.24,0.47


In [20]:
sample.loc[:,"Ineffective"] = test_predict[:,0]
sample.loc[:,"Adequate"] = test_predict[:,1]
sample.loc[:,"Effective"] = test_predict[:,2]
sample.to_csv('submission.csv',index=False)