In [196]:
import sklearn
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sn

In [197]:
pd.set_option('display.float_format', lambda x: '%.5f' % x)

### Importing data

In [198]:
topics =["abortion", "cloning", "death_penalty", "gun_control", "marijuana_legalization", "minimum_wage", "nuclear_energy", "school_uniforms"]

In [199]:
def import_data(topic):
    input_file = f'/Users/myrthereuver/PycharmProjects/Claim_reproduction/datasets/ukp/data/complete/{topic}.tsv'
    df_current = pd.read_csv(input_file.format(topic), delimiter = "\t", quoting=3)
    return df_current

In [200]:
data_abortion = import_data("abortion")
data_abortion.head()

Unnamed: 0,topic,retrievedUrl,archivedUrl,sentenceHash,sentence,annotation,set
0,abortion,http://2012election.procon.org/view.additional...,http://web.archive.org/web/20150415052859/http...,a1d2d5656a5029eb558812b8259b6567,This means it has to steer monetary policy to ...,NoArgument,val
1,abortion,http://www.listland.com/top-10-arguments-in-su...,http://web.archive.org/web/20160829133344/http...,a4374eb8cae2c1d52499d0489c7bfb1d,Where did you get that ?,NoArgument,train
2,abortion,http://www.americamagazine.org/issue/feminist-...,http://web.archive.org/web/20160422223822/http...,825b1a5e0e7915950a2a4a657230d530,Nathanson later became pro-life .,NoArgument,val
3,abortion,http://www.strangenotions.com/answering-three-...,http://web.archive.org/web/20160916225634/http...,644379f8e228f50f0871270164878c9b,In this case we may never do evil ( directly a...,Argument_against,train
4,abortion,http://www.healthguidance.org/entry/13561/1/Pr...,http://web.archive.org/web/20160425042210/http...,51eefb36e8947e42403e336536cb00f0,With that I would like to give everyone someth...,NoArgument,test


In [201]:
data_list = [import_data(t) for t in topics]
all_data = pd.concat(data_list)

### Exploring data

In [202]:
data_abortion['set'].value_counts()

train    2827
test      787
val       315
Name: set, dtype: int64

In [203]:
data_abortion.groupby('set')['annotation'].value_counts()

set    annotation      
test   NoArgument           486
       Argument_against     165
       Argument_for         136
train  NoArgument          1746
       Argument_against     591
       Argument_for         490
val    NoArgument           195
       Argument_against      66
       Argument_for          54
Name: annotation, dtype: int64

In [204]:
data_list[3][:3]

Unnamed: 0,topic,retrievedUrl,archivedUrl,sentenceHash,sentence,annotation,set
0,gun control,http://www.theatlantic.com/magazine/archive/20...,http://web.archive.org/web/20160512215933/http...,7f5b3b58c98b7ee686eb8008f6d8d068,"“ I had deep anger when I heard that , ” he to...",NoArgument,train
1,gun control,http://concealedguns.procon.org/,http://web.archive.org/web/20161107160654/http...,5875b612a01b700fdda1d2402efdda16,"According to John R. Lott Jr. , PhD , "" when s...",Argument_for,train
2,gun control,http://navajocodetalkers.org/9-principal-pros-...,http://web.archive.org/web/20160506123220/http...,4fb05a0f3420566ddb0c23b9099c39e9,Education Is The Answer More harsh gun control...,Argument_against,train


#### Leave Topic Out

In [205]:
def leave_topic_out(data, left_out_topic):
    data = data[data.topic != left_out_topic]
    return data

In [206]:
data_without_abortion = leave_topic_out(all_data, "abortion")

In [207]:
data_without_cloning = leave_topic_out(all_data, "cloning")

In [208]:
data_without_deathpen = leave_topic_out(all_data, "death penalty")

In [209]:
data_without_guncontrol = leave_topic_out(all_data, "gun control")

In [210]:
data_without_marijuana = leave_topic_out(all_data, "marijuana legalization")

In [211]:
data_without_minwage = leave_topic_out(all_data, "minimum wage")

In [212]:
data_without_nuclear = leave_topic_out(all_data, "nuclear energy")

In [213]:
data_without_schooluni = leave_topic_out(all_data, "school uniforms")

### Splitting data in pre-defined training, test split

In [214]:
def train_dev_test_split(data):
    test = data[data.set == 'test']
    train = data[data.set == 'train']
    dev = data[data.set == 'val']
    return train, dev, test

In [215]:
topics = ["abortion", "cloning", "death_penalty", "gun_control", "marijuana_legalization", "minimum_wage", "nuclear_energy", "school_uniforms"]

In [216]:
all_data_train, all_data_val, all_data_test = train_dev_test_split(all_data)

#### abortion

In [217]:
data_abortion_train, data_abortion_val, data_abortion_test = train_dev_test_split(data_list[0])

In [218]:
data_without_abortion_train, data_without_abortion_dev, data_without_abortion_test = train_dev_test_split(data_without_abortion)

#### cloning

In [219]:
data_cloning_train, data_cloning_val, data_cloning_test = train_dev_test_split(data_list[1])

In [220]:
data_without_cloning_train, data_without_cloning_dev, data_without_cloning_test = train_dev_test_split(data_without_cloning)

#### death penalty

In [221]:
data_death_train, data_death_val, data_death_test = train_dev_test_split(data_list[2])

In [222]:
data_without_death_train, data_without_death_dev, data_without_death_test = train_dev_test_split(data_without_deathpen)

#### gun control

In [223]:
data_gun_train, data_gun_val, data_gun_test = train_dev_test_split(data_list[3])

In [224]:
data_without_gun_train, data_without_gun_dev, data_without_gun_test = train_dev_test_split(data_without_guncontrol)

#### marijuana

In [225]:
data_marijuana_train, data_marijuana_val, data_marijuana_test = train_dev_test_split(data_list[4])

In [226]:
data_without_marijuana_train, data_without_marijuana_dev, data_without_marijuana_test = train_dev_test_split(data_without_marijuana)

#### minimum wage

In [227]:
data_minwage_train, data_minwage_val, data_minwage_test = train_dev_test_split(data_list[5])

In [228]:
data_without_minwage_train, data_without_abortion_dev, data_without_abortion_test = train_dev_test_split(data_without_minwage)

#### nuclear energy

In [229]:
data_nuclear_train, data_nuclear_val, data_nuclear_test = train_dev_test_split(data_list[6])

In [230]:
data_without_nuclear_train, data_without_nuclear_dev, data_without_nuclear_test = train_dev_test_split(data_without_nuclear)

#### school uniforms

In [231]:
data_schooluni_train, data_schooluni_val, data_schooluni_test = train_dev_test_split(data_list[7])

In [232]:
data_without_schooluni_train, data_without_schooluni_dev, data_without_schooluni_test = train_dev_test_split(data_without_schooluni)

### SVM classifier

#### Preprocessing as mentioned in paper

In [233]:
## Stemming

In [234]:
### lemmatization

#### Pipeline

In [235]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
abortion_train_counts = count_vect.fit_transform(data_abortion_train.sentence)
abortion_train_counts.shape

(2827, 6753)

In [236]:
from sklearn.feature_extraction.text import TfidfTransformer
tf_transformer = TfidfTransformer(use_idf=False).fit(abortion_train_counts)
abortion_train_tf = tf_transformer.transform(abortion_train_counts)
abortion_train_tf.shape

(2827, 6753)

In [237]:
from sklearn import svm

model = svm.LinearSVC()
model.fit(abortion_train_tf, data_abortion_train.annotation)

LinearSVC()

In [238]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn import svm

text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', svm.LinearSVC()),
     ])

In [239]:
text_clf_hyper = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', svm.LinearSVC(C=0.1)),
     ])

In [240]:
topics = ["abortion", "cloning", "death_penalty", "gun_control", "marijuana_legalization", "minimum_wage", "nuclear_energy", "school_uniforms"]

### train all but abortion

In [241]:
model_without_abortion = text_clf.fit(data_without_abortion_train.sentence, data_without_abortion_train.annotation)

In [242]:
model_without_abortion_hyper = text_clf_hyper.fit(data_without_abortion_train.sentence, data_without_abortion_train.annotation)

### train all but cloning

In [243]:
model_without_cloning = text_clf.fit(data_without_cloning_train.sentence, data_without_cloning_train.annotation)

In [244]:
model_without_cloning_hyper = text_clf_hyper.fit(data_without_cloning_train.sentence, data_without_cloning_train.annotation)

### train all but nuclear

In [245]:
model_without_nuclear = text_clf.fit(data_without_nuclear_train.sentence, data_without_nuclear_train.annotation)

In [246]:
model_without_nuclear_hyper = text_clf_hyper.fit(data_without_nuclear_train.sentence, data_without_nuclear_train.annotation)

### train all but school uniform

In [247]:
model_without_schooluni = text_clf.fit(data_without_schooluni_train.sentence, data_without_schooluni_train.annotation)

In [248]:
model_without_schooluni_hyper = text_clf_hyper.fit(data_without_schooluni_train.sentence, data_without_schooluni_train.annotation)

### train all but gun control

In [249]:
model_without_gun = text_clf.fit(data_without_gun_train.sentence, data_without_gun_train.annotation)

In [250]:
model_without_gun_hyper = text_clf_hyper.fit(data_without_gun_train.sentence, data_without_gun_train.annotation)

### train all but death penalty

In [251]:
model_without_death = text_clf.fit(data_without_death_train.sentence, data_without_death_train.annotation)

In [252]:
model_without_death_hyper = text_clf_hyper.fit(data_without_death_train.sentence, data_without_death_train.annotation)

### train all but minimum wage

In [253]:
model_without_minwage = text_clf.fit(data_without_minwage_train.sentence, data_without_minwage_train.annotation)

In [254]:
model_without_minwage_hyper = text_clf_hyper.fit(data_without_minwage_train.sentence, data_without_minwage_train.annotation)

### train all but marijuana

In [255]:
model_without_marijuana = text_clf.fit(data_without_marijuana_train.sentence, data_without_marijuana_train.annotation)

In [256]:
model_without_marijuana_hyper = text_clf_hyper.fit(data_without_marijuana_train.sentence, data_without_marijuana_train.annotation)

#### Fine-tuning hyperparameters

In [257]:
from sklearn.model_selection import GridSearchCV
param_grid = {'clf__C': [0.1,1, 10, 100]}

In [258]:
# grid = GridSearchCV(text_clf,param_grid,refit=True,verbose=2)
# grid.fit(data_without_marijuana_train.sentence, data_without_marijuana_train.annotation)

In [259]:
# print(grid.best_estimator_)

In [260]:
# print(grid.best_params_)

## Results

#### Leave one topic out

In [261]:
def results_model_leavetopic(model, validation, labels=None):
    from sklearn import metrics

    predicted = model.predict(validation['sentence'])
  
    print(metrics.classification_report(validation.annotation, predicted, labels=labels, digits=4))
    accuracy = np.mean(predicted == validation.annotation)
    return f'accuracy = {accuracy}'

#### train all but abortion, test abortion

#### Non-hyper

In [262]:
results_model_leavetopic(model_without_abortion, data_abortion_val)

                  precision    recall  f1-score   support

Argument_against     0.4308    0.4242    0.4275        66
    Argument_for     0.4348    0.3704    0.4000        54
      NoArgument     0.7843    0.8205    0.8020       195

        accuracy                         0.6603       315
       macro avg     0.5500    0.5384    0.5432       315
    weighted avg     0.6503    0.6603    0.6546       315



'accuracy = 0.6603174603174603'

In [263]:
results_model_leavetopic(model_without_abortion, data_abortion_test)

                  precision    recall  f1-score   support

Argument_against     0.4103    0.2909    0.3404       165
    Argument_for     0.3596    0.2353    0.2844       136
      NoArgument     0.7005    0.8374    0.7629       486

        accuracy                         0.6188       787
       macro avg     0.4901    0.4546    0.4626       787
    weighted avg     0.5807    0.6188    0.5916       787



'accuracy = 0.6188055908513341'

In [264]:
results_model_leavetopic(model_without_abortion, data_abortion_test)

                  precision    recall  f1-score   support

Argument_against     0.4103    0.2909    0.3404       165
    Argument_for     0.3596    0.2353    0.2844       136
      NoArgument     0.7005    0.8374    0.7629       486

        accuracy                         0.6188       787
       macro avg     0.4901    0.4546    0.4626       787
    weighted avg     0.5807    0.6188    0.5916       787



'accuracy = 0.6188055908513341'

#### Hyper

In [265]:
results_model_leavetopic(model_without_abortion_hyper, data_abortion_val)

                  precision    recall  f1-score   support

Argument_against     0.5405    0.3030    0.3883        66
    Argument_for     0.5517    0.2963    0.3855        54
      NoArgument     0.7309    0.9333    0.8198       195

        accuracy                         0.6921       315
       macro avg     0.6077    0.5109    0.5312       315
    weighted avg     0.6603    0.6921    0.6550       315



'accuracy = 0.692063492063492'

### train all but school uniform, test school uniform

#### Non hyper

In [266]:
results_model_leavetopic(model_without_schooluni, data_schooluni_val)

                  precision    recall  f1-score   support

Argument_against     0.6000    0.5172    0.5556        58
    Argument_for     0.6471    0.5000    0.5641        44
      NoArgument     0.7771    0.8777    0.8243       139

        accuracy                         0.7220       241
       macro avg     0.6747    0.6316    0.6480       241
    weighted avg     0.7107    0.7220    0.7121       241



'accuracy = 0.7219917012448133'

In [267]:
results_model_leavetopic(model_without_schooluni, data_schooluni_test)

                  precision    recall  f1-score   support

Argument_against     0.5221    0.4041    0.4556       146
    Argument_for     0.4867    0.5046    0.4955       109
      NoArgument     0.7473    0.8098    0.7773       347

        accuracy                         0.6561       602
       macro avg     0.5854    0.5728    0.5761       602
    weighted avg     0.6455    0.6561    0.6483       602



'accuracy = 0.6561461794019934'

In [268]:
# results_model_leavetopic(model_without_schooluni, data_schooluni_test)

#### Hyper

In [269]:
results_model_leavetopic(model_without_schooluni_hyper, data_schooluni_test)

                  precision    recall  f1-score   support

Argument_against     0.5244    0.2945    0.3772       146
    Argument_for     0.5429    0.3486    0.4246       109
      NoArgument     0.6978    0.9049    0.7880       347

        accuracy                         0.6561       602
       macro avg     0.5883    0.5160    0.5299       602
    weighted avg     0.6277    0.6561    0.6225       602



'accuracy = 0.6561461794019934'

### train all but gun control, test gun control

#### Non Hyper

In [270]:
results_model_leavetopic(model_without_gun, data_gun_val)

                  precision    recall  f1-score   support

Argument_against     0.4000    0.3774    0.3883        53
    Argument_for     0.6200    0.4921    0.5487        63
      NoArgument     0.7798    0.8618    0.8187       152

        accuracy                         0.6791       268
       macro avg     0.5999    0.5771    0.5853       268
    weighted avg     0.6671    0.6791    0.6701       268



'accuracy = 0.6791044776119403'

In [271]:
results_model_leavetopic(model_without_gun, data_gun_test)

                  precision    recall  f1-score   support

Argument_against     0.3517    0.3835    0.3669       133
    Argument_for     0.4054    0.4747    0.4373       158
      NoArgument     0.7817    0.7011    0.7392       378

        accuracy                         0.5845       669
       macro avg     0.5129    0.5197    0.5145       669
    weighted avg     0.6074    0.5845    0.5939       669



'accuracy = 0.5844544095665172'

#### Hyper

In [272]:
results_model_leavetopic(model_without_gun_hyper, data_gun_test)

                  precision    recall  f1-score   support

Argument_against     0.4333    0.2932    0.3498       133
    Argument_for     0.4507    0.4051    0.4267       158
      NoArgument     0.7231    0.8360    0.7755       378

        accuracy                         0.6263       669
       macro avg     0.5357    0.5114    0.5173       669
    weighted avg     0.6012    0.6263    0.6085       669



'accuracy = 0.6263079222720478'

### train all but marijuana, test marijuana

#### Non Hyper

In [273]:
results_model_leavetopic(model_without_marijuana, data_marijuana_val)

                  precision    recall  f1-score   support

Argument_against     0.5625    0.1800    0.2727        50
    Argument_for     0.2727    0.0638    0.1034        47
      NoArgument     0.5497    0.9307    0.6912       101

        accuracy                         0.5354       198
       macro avg     0.4616    0.3915    0.3558       198
    weighted avg     0.4872    0.5354    0.4460       198



'accuracy = 0.5353535353535354'

In [274]:
results_model_leavetopic(model_without_marijuana, data_marijuana_test)

                  precision    recall  f1-score   support

Argument_against     0.3261    0.1190    0.1744       126
    Argument_for     0.3000    0.0508    0.0870       118
      NoArgument     0.5615    0.9565    0.7076       253

        accuracy                         0.5292       497
       macro avg     0.3959    0.3755    0.3230       497
    weighted avg     0.4397    0.5292    0.4251       497



'accuracy = 0.5291750503018109'

#### Hyper

In [275]:
results_model_leavetopic(model_without_marijuana_hyper, data_marijuana_test)

                  precision    recall  f1-score   support

Argument_against     0.3529    0.0476    0.0839       126
    Argument_for     0.5556    0.0424    0.0787       118
      NoArgument     0.5329    0.9921    0.6934       253

        accuracy                         0.5272       497
       macro avg     0.4805    0.3607    0.2853       497
    weighted avg     0.4927    0.5272    0.3929       497



'accuracy = 0.5271629778672032'

### train all but death penalty, test death penalty

#### Non hyper

In [276]:
results_model_leavetopic(model_without_death, data_death_val)

                  precision    recall  f1-score   support

Argument_against     0.6024    0.5556    0.5780        90
    Argument_for     0.3333    0.2895    0.3099        38
      NoArgument     0.7853    0.8424    0.8129       165

        accuracy                         0.6826       293
       macro avg     0.5737    0.5625    0.5669       293
    weighted avg     0.6705    0.6826    0.6755       293



'accuracy = 0.6825938566552902'

In [277]:
results_model_leavetopic(model_without_death, data_death_test)

                  precision    recall  f1-score   support

Argument_against     0.4436    0.5086    0.4739       232
    Argument_for     0.2921    0.2524    0.2708       103
      NoArgument     0.7207    0.6843    0.7021       396

        accuracy                         0.5677       731
       macro avg     0.4855    0.4818    0.4823       731
    weighted avg     0.5724    0.5677    0.5689       731



'accuracy = 0.5677154582763337'

#### Hyper

In [278]:
results_model_leavetopic(model_without_death_hyper, data_death_test)

                  precision    recall  f1-score   support

Argument_against     0.4612    0.4871    0.4738       232
    Argument_for     0.2000    0.0485    0.0781       103
      NoArgument     0.6855    0.7980    0.7375       396

        accuracy                         0.5937       731
       macro avg     0.4489    0.4445    0.4298       731
    weighted avg     0.5459    0.5937    0.5609       731



'accuracy = 0.5937072503419972'

### train all but minwage, test minwage

#### Non hyper

In [279]:
results_model_leavetopic(model_without_minwage, data_minwage_val)

                  precision    recall  f1-score   support

Argument_against     0.5135    0.4318    0.4691        44
    Argument_for     0.5476    0.5000    0.5227        46
      NoArgument     0.7479    0.8241    0.7841       108

        accuracy                         0.6616       198
       macro avg     0.6030    0.5853    0.5920       198
    weighted avg     0.6493    0.6616    0.6534       198



'accuracy = 0.6616161616161617'

In [280]:
results_model_leavetopic(model_without_minwage, data_minwage_test)

                  precision    recall  f1-score   support

Argument_against     0.5408    0.4775    0.5072       111
    Argument_for     0.6105    0.5000    0.5498       116
      NoArgument     0.7434    0.8370    0.7875       270

        accuracy                         0.6781       497
       macro avg     0.6316    0.6048    0.6148       497
    weighted avg     0.6672    0.6781    0.6694       497



'accuracy = 0.6780684104627767'

#### Hyper

In [281]:
results_model_leavetopic(model_without_minwage_hyper, data_minwage_test)

                  precision    recall  f1-score   support

Argument_against     0.6667    0.4144    0.5111       111
    Argument_for     0.6500    0.4483    0.5306       116
      NoArgument     0.7126    0.9185    0.8026       270

        accuracy                         0.6962       497
       macro avg     0.6764    0.5937    0.6148       497
    weighted avg     0.6878    0.6962    0.6740       497



'accuracy = 0.6961770623742455'

### train all but nuclear, test nuclear

#### Non Hyper

In [282]:
results_model_leavetopic(model_without_nuclear, data_nuclear_val)

                  precision    recall  f1-score   support

Argument_against     0.5082    0.4559    0.4806        68
    Argument_for     0.5000    0.5000    0.5000        48
      NoArgument     0.7966    0.8294    0.8127       170

        accuracy                         0.6853       286
       macro avg     0.6016    0.5951    0.5978       286
    weighted avg     0.6783    0.6853    0.6813       286



'accuracy = 0.6853146853146853'

In [283]:
results_model_leavetopic(model_without_nuclear, data_nuclear_test)

                  precision    recall  f1-score   support

Argument_against     0.5245    0.6257    0.5707       171
    Argument_for     0.3884    0.3852    0.3868       122
      NoArgument     0.8010    0.7406    0.7696       424

        accuracy                         0.6527       717
       macro avg     0.5713    0.5838    0.5757       717
    weighted avg     0.6649    0.6527    0.6570       717



'accuracy = 0.6527196652719666'

#### Hyper

In [284]:
results_model_leavetopic(model_without_nuclear_hyper, data_nuclear_test)

                  precision    recall  f1-score   support

Argument_against     0.5460    0.5205    0.5329       171
    Argument_for     0.4925    0.2705    0.3492       122
      NoArgument     0.7495    0.8608    0.8013       424

        accuracy                         0.6792       717
       macro avg     0.5960    0.5506    0.5612       717
    weighted avg     0.6572    0.6792    0.6604       717



'accuracy = 0.6792189679218968'

### train all but cloning, test cloning

#### Non hyper

In [285]:
results_model_leavetopic(model_without_cloning, data_cloning_val)

                  precision    recall  f1-score   support

Argument_against     0.6667    0.5075    0.5763        67
    Argument_for     0.5273    0.5179    0.5225        56
      NoArgument     0.7226    0.8250    0.7704       120

        accuracy                         0.6667       243
       macro avg     0.6389    0.6168    0.6231       243
    weighted avg     0.6622    0.6667    0.6598       243



'accuracy = 0.6666666666666666'

In [286]:
results_model_leavetopic(model_without_cloning, data_cloning_test)

                  precision    recall  f1-score   support

Argument_against     0.5607    0.5774    0.5689       168
    Argument_for     0.5039    0.4507    0.4758       142
      NoArgument     0.6990    0.7224    0.7105       299

        accuracy                         0.6190       609
       macro avg     0.5879    0.5835    0.5851       609
    weighted avg     0.6154    0.6190    0.6167       609



'accuracy = 0.6190476190476191'

#### Hyper

In [287]:
results_model_leavetopic(model_without_cloning_hyper, data_cloning_test)

                  precision    recall  f1-score   support

Argument_against     0.6597    0.5655    0.6090       168
    Argument_for     0.5750    0.3239    0.4144       142
      NoArgument     0.6571    0.8462    0.7398       299

        accuracy                         0.6470       609
       macro avg     0.6306    0.5785    0.5877       609
    weighted avg     0.6387    0.6470    0.6278       609



'accuracy = 0.6469622331691297'

### Only train on only one topic

#### train abortion, validation gun control

### Find Features

In [288]:
def train_featuremodel(X):
    #because my method does not work with the Pipeline object, I do the pipeline myself
    from sklearn import svm
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.feature_extraction.text import TfidfTransformer
    model_abortion = svm.LinearSVC()
    count_vect = CountVectorizer()
    train_counts = count_vect.fit_transform(X.sentence)
    tf_transformer = TfidfTransformer(use_idf=False).fit_transform(train_counts)
    model.fit(tf_transformer, X.annotation)
    return model, count_vect, tf_transformer

#### Abortion

In [289]:
abortion_model, count_vect_abortion, tf_transformer_abortion = train_featuremodel(data_abortion_train)

In [290]:
print(abortion_model.classes_) 
print(abortion_model.coef_.shape) 

['Argument_against' 'Argument_for' 'NoArgument']
(3, 6753)


In [291]:
train_counts = count_vect.fit_transform(data_abortion_train.sentence)
tf_transformer = TfidfTransformer(use_idf=False)
tf_transformer.fit(train_counts)

abortion_val_tf = count_vect.transform(data_abortion_test.sentence)
abortion_val_idf = tf_transformer.transform(abortion_val_tf)

In [292]:
predicted = abortion_model.predict(abortion_val_idf)
print(metrics.classification_report(data_abortion_test.annotation, predicted))

NameError: name 'metrics' is not defined

#### cloning

In [None]:
cloning_model, count_vect_cloning, tf_transformer_cloning = train_featuremodel(data_cloning_train)

In [None]:
print(cloning_model.classes_) 
print(cloning_model.coef_.shape) 

In [None]:
train_counts = count_vect.fit_transform(data_cloning_train.sentence)
tf_transformer = TfidfTransformer(use_idf=False)
tf_transformer.fit(train_counts)

cloning_val_tf = count_vect.transform(data_cloning_test.sentence)
cloning_val_idf = tf_transformer.transform(cloning_val_tf)

In [None]:
predicted = cloning_model.predict(cloning_val_idf)
print(metrics.classification_report(data_cloning_test.annotation, predicted))

#### death pen

In [None]:
death_model, count_vect_death, tf_transformer_death = train_featuremodel(data_death_train)

In [None]:
print(death_model.classes_) 
print(death_model.coef_.shape) 

In [None]:
train_counts = count_vect.fit_transform(data_death_train.sentence)
tf_transformer = TfidfTransformer(use_idf=False)
tf_transformer.fit(train_counts)

death_val_tf = count_vect.transform(data_death_test.sentence)
death_val_idf = tf_transformer.transform(death_val_tf)

In [None]:
predicted = death_model.predict(death_val_idf)
print(metrics.classification_report(data_death_test.annotation, predicted))

#### gun control

In [None]:
gun_model, count_vect_gun, tf_transformer_gun = train_featuremodel(data_gun_train)

In [None]:
print(gun_model.classes_) 
print(gun_model.coef_.shape) 

In [None]:
train_counts = count_vect.fit_transform(data_gun_train.sentence)
tf_transformer = TfidfTransformer(use_idf=False)
tf_transformer.fit(train_counts)

gun_val_tf = count_vect.transform(data_gun_test.sentence)
gun_val_idf = tf_transformer.transform(gun_val_tf)

In [None]:
predicted = gun_model.predict(gun_val_idf)
print(metrics.classification_report(data_gun_test.annotation, predicted))

#### marijuana legalization

In [None]:
marijuana_model, count_vect_marijuana, tf_transformer_marijuana = train_featuremodel(data_marijuana_train)

In [None]:
print(marijuana_model.classes_) 
print(marijuana_model.coef_.shape) 

In [None]:
train_counts = count_vect.fit_transform(data_marijuana_train.sentence)
tf_transformer = TfidfTransformer(use_idf=False)
tf_transformer.fit(train_counts)

marijuana_val_tf = count_vect.transform(data_marijuana_test.sentence)
marijuana_val_idf = tf_transformer.transform(marijuana_val_tf)

In [None]:
predicted = gun_model.predict(marijuana_val_idf)
print(metrics.classification_report(data_marijuana_test.annotation, predicted))

#### Minimum wage

In [None]:
minwage_model, count_vect_min, tf_transformer_minwage = train_featuremodel(data_minwage_train)

In [None]:
print(minwage_model.classes_) 
print(minwage_model.coef_.shape) 

In [None]:
train_counts = count_vect.fit_transform(data_minwage_train.sentence)
tf_transformer = TfidfTransformer(use_idf=False)
tf_transformer.fit(train_counts)

minwage_val_tf = count_vect.transform(data_minwage_test.sentence)
minwage_val_idf = tf_transformer.transform(minwage_val_tf)

In [None]:
predicted = minwage_model.predict(minwage_val_idf)
print(metrics.classification_report(data_minwage_test.annotation, predicted))

#### nuclear energy

In [None]:
nuclear_model, count_vect_nuclear, tf_transformer_minwage = train_featuremodel(data_nuclear_train)

In [None]:
print(nuclear_model.classes_) 
print(nuclear_model.coef_.shape) 

In [None]:
train_counts = count_vect.fit_transform(data_nuclear_train.sentence)
tf_transformer = TfidfTransformer(use_idf=False)
tf_transformer.fit(train_counts)

nuclear_val_tf = count_vect.transform(data_nuclear_test.sentence)
nuclear_val_idf = tf_transformer.transform(nuclear_val_tf)

In [None]:
predicted = minwage_model.predict(nuclear_val_idf)
print(metrics.classification_report(data_nuclear_test.annotation, predicted))

#### school uniform

In [None]:
schooluni_model, count_vect_schooluni, tf_transformer_schooluni = train_featuremodel(data_schooluni_train)

In [None]:
print(schooluni_model.classes_) 
print(schooluni_model.coef_.shape) 

In [None]:
train_counts = count_vect.fit_transform(data_schooluni_train.sentence)
tf_transformer = TfidfTransformer(use_idf=False)
tf_transformer.fit(train_counts)

schooluni_val_tf = count_vect.transform(data_schooluni_test.sentence)
schooluni_val_idf = tf_transformer.transform(schooluni_val_tf)

In [None]:
predicted = minwage_model.predict(schooluni_val_idf)
print(metrics.classification_report(data_schooluni_test.annotation, predicted))

##### The shape of svc.coef_ shows that there are 3 sets of weights. These correspond to the following class label pairs: against/For, Against/NoArg, for/NoArg

#### All

In [None]:
all_model, count_vect_all, tf_transformer_all = train_featuremodel(all_data_train)

In [None]:
train_counts = count_vect.fit_transform(all_data_train.sentence)
tf_transformer = TfidfTransformer(use_idf=False)
tf_transformer.fit(train_counts)

all_val_tf = count_vect.transform(all_data_test.sentence)
all_val_idf = tf_transformer.transform(all_val_tf)

In [None]:
predicted = all_model.predict(all_val_idf)
print(metrics.classification_report(all_data_test.annotation, predicted))

In [None]:
def dataframe_coefficients(classifier, class_pair, count_vect, top_features=20):
    coefs_with_fns = sorted(zip(model.coef_[class_pair], feature_names)) 
    df=pd.DataFrame(coefs_with_fns)
    df.columns='coefficient','word'
    df.sort_values(by='coefficient')
    
    if class_pair == 0:
        positive = "For"
        negative = "NoArg"
    if class_pair == 1:
        positive = "NoArg"
        negative = "For"
    if class_pair == 2:
        positive = "Against"
        negative = "NoArg"
    
#     df_negativeclass = df[df['coefficient'] < 0]
    df_negativeclass = df.sort_values(by='coefficient', ascending=False)
    df_negative_top = df_negativeclass[:top_features]
    
#     df_positiveclass = df[df['coefficient'] > 0]
    df_positiveclass = df.sort_values(by='coefficient')
    df_positive_top = df_positiveclass[:top_features]
    
    print(negative)
    print(df_negative_top)
    print("//////")
    print("######")
    print("\\\\\\")
    print(positive)
    print(df_positive_top)
    
    return df, df_positive_top, df_negative_top

#### Positive = for, Negative = No Arg

In [None]:
a, n, p = dataframe_coefficients(all_model, 1, tf_transformer, top_features=10)

In [None]:
n["word"]

#### Positive = Against, Negative = NoArg

In [None]:
a, n, p = dataframe_coefficients(all_model, 2, feature_names, top_features=10)

In [None]:
p["word"]

#### Positive = Against, Negative = For

In [None]:
a, n, p = dataframe_coefficients(all_model, 0, tf_transformer, top_features=10)

In [None]:
n["word"]

In [None]:
### Negative coefficients correspond to anti, positive coefficients to neutral

In [None]:
df_NegativeClass = 

df_PositiveClass = 

In [None]:
### 