# Artificial Intelligence II: HW1 Tutorial

In [72]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
import pandas as pd

## Loading and Exploring the Dataset

In [73]:
# Load the training split; the first CSV column is used as the row index.
train_df = pd.read_csv(
    "VaccineSentimentDataset/vaccine_train_set.csv",
    index_col=0,
)
train_df

Unnamed: 0,tweet,label
0,Sip N Shop Come thru right now #Marjais #Popul...,0
1,I don't know about you but My family and I wil...,1
2,@MSignorile Immunizations should be mandatory....,2
3,President Obama spoke in favor of vaccination ...,0
4,"""@myfoxla: Arizona monitoring hundreds for mea...",0
...,...,...
15971,@Salon if u believe the anti-vax nutcases caus...,1
15972,How do you feel about parents who don't #vacci...,0
15973,70 Preschoolers Tested for Measles in Simi Val...,0
15974,Finance Minister: Budget offers room to procur...,0


In [74]:
# Summary statistics of the numeric columns of the training frame
# (only `label` is numeric, so it is the only column summarised).
train_df.describe()

Unnamed: 0,label
count,15976.0
mean,0.936592
std,0.93074
min,0.0
25%,0.0
50%,1.0
75%,2.0
max,2.0


In [75]:
# Load the validation split (kept under the name `test_df` for the rest of
# the notebook); the first CSV column is used as the row index.
test_df = pd.read_csv(
    "VaccineSentimentDataset/vaccine_validation_set.csv",
    index_col=0,
)
test_df

Unnamed: 0,tweet,label
0,@user They had a massive surge in with covid d...,1
1,Required vaccines for school: Parents and guar...,0
2,“@KCStar: Two more Johnson County children hav...,0
3,NV can do better. Which states are the best (a...,2
4,Nothing like killing ourselves w/ our own fear...,2
...,...,...
2277,RT @abc7: Number of measles cases reported in ...,0
2278,"Evidence points to the idea that ""measles affe...",0
2279,"Where's @SavedYouAClick ""@voxdotcom: Why you s...",2
2280,Some of my favorite people have autism. If tha...,2


In [76]:
# Summary statistics of the numeric columns of the validation frame
# (only `label` is numeric, so it is the only column summarised).
test_df.describe()

Unnamed: 0,label
count,2282.0
mean,0.936897
std,0.93096
min,0.0
25%,0.0
50%,1.0
75%,2.0
max,2.0


## Visualize Data

In [None]:
import seaborn as sn

# Class balance of the training split: one bar per label value.
# The Axes is captured so the figure can be given a title and axis labels,
# and the trailing `;` keeps the Axes repr out of the cell output.
ax = sn.countplot(x='label', data=train_df)
ax.set(title="Label distribution (train_df)", xlabel="label", ylabel="number of tweets");

In [None]:
# Class balance of the held-out split: one bar per label value.
# The Axes is captured so the figure can be given a title and axis labels,
# and the trailing `;` keeps the Axes repr out of the cell output.
ax = sn.countplot(x='label', data=test_df)
ax.set(title="Label distribution (test_df)", xlabel="label", ylabel="number of tweets");

## Data Pre-processing

In [77]:
# Count the missing values in each column of the training frame.
train_df.isna().sum()

tweet    0
label    0
dtype: int64

In [78]:
# Count the missing values in each column of the held-out frame.
test_df.isna().sum()

tweet    0
label    0
dtype: int64

#### Separating features from targets

In [79]:
# Targets are the `label` column; features are everything else.
Y_train = train_df['label']
X_train = train_df.drop(columns='label')
X_train

Unnamed: 0,tweet
0,Sip N Shop Come thru right now #Marjais #Popul...
1,I don't know about you but My family and I wil...
2,@MSignorile Immunizations should be mandatory....
3,President Obama spoke in favor of vaccination ...
4,"""@myfoxla: Arizona monitoring hundreds for mea..."
...,...
15971,@Salon if u believe the anti-vax nutcases caus...
15972,How do you feel about parents who don't #vacci...
15973,70 Preschoolers Tested for Measles in Simi Val...
15974,Finance Minister: Budget offers room to procur...


In [80]:
# Target labels of the training tweets (the `label` column of train_df).
Y_train

0        0
1        1
2        2
3        0
4        0
        ..
15971    1
15972    0
15973    0
15974    0
15975    2
Name: label, Length: 15976, dtype: int64

In [81]:
# Targets are the `label` column; features are everything else.
Y_test = test_df['label']
X_test = test_df.drop(columns='label')
X_test

Unnamed: 0,tweet
0,@user They had a massive surge in with covid d...
1,Required vaccines for school: Parents and guar...
2,“@KCStar: Two more Johnson County children hav...
3,NV can do better. Which states are the best (a...
4,Nothing like killing ourselves w/ our own fear...
...,...
2277,RT @abc7: Number of measles cases reported in ...
2278,"Evidence points to the idea that ""measles affe..."
2279,"Where's @SavedYouAClick ""@voxdotcom: Why you s..."
2280,Some of my favorite people have autism. If tha...


In [82]:
# Target labels of the held-out tweets (the `label` column of test_df).
Y_test

0       1
1       0
2       0
3       2
4       2
       ..
2277    0
2278    0
2279    2
2280    2
2281    0
Name: label, Length: 2282, dtype: int64

#### Text pre-processing

In [98]:
# Import only the helper that is used instead of every public name of the
# module, so it stays obvious where `preprocess` comes from.
from Preprocess import preprocess

# Clean the training tweets with the project's `preprocess` helper.
# NOTE(review): the helper's exact steps live in Preprocess.py and are not
# visible here -- confirm them there.
# The result is written back to `X_train`, so re-running this cell feeds
# already-processed text to `preprocess` again; re-run the feature/target
# split cell first when a fresh start is needed.
X_train = preprocess(X_train)
X_train

  dataset['tweet'] = dataset['tweet'].str.replace('[^\w\s]', '')  # replace(r'^https?:\/\/.*[\r?\n\t@]*', '')


KeyboardInterrupt: 

In [None]:
# Apply the same `preprocess` helper to the held-out tweets so both splits
# go through identical cleaning. The result is written back to `X_test`, so
# re-running this cell feeds already-processed text to `preprocess` again.
X_test = preprocess(X_test)
X_test

#### Vectorize words

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features: unigrams with the default settings, no tuning yet.
# The vocabulary is learned from the training tweets only; `fit_transform`
# learns it and builds the document-term matrix in one step.
vectorizer = CountVectorizer()
train_vec = vectorizer.fit_transform(X_train['tweet'])

# `get_feature_names()` is deprecated and removed in newer scikit-learn
# releases. The fitted `vocabulary_` mapping (term -> column index) gives the
# same names in column order on every version.
feature_names = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)

# Show the vocabulary size and a short sample instead of every term.
print(f"{len(feature_names)} features; first 20: {feature_names[:20]}")

In [None]:
# Encode the held-out tweets with the vectorizer that was fitted on the
# training tweets. Fitting a second CountVectorizer on the test tweets would
# learn a different vocabulary, so the columns of `test_vec` would not line up
# with the features the classifier is trained on. Terms that never occurred
# in the training tweets are simply ignored by `transform`.
test_vec = vectorizer.transform(X_test['tweet'])

# Both matrices must share the same feature columns.
assert test_vec.shape[1] == train_vec.shape[1], "train/test feature spaces differ"
test_vec.shape

## Train model

In [None]:
# Fit a logistic-regression classifier (scikit-learn default settings) on the
# bag-of-words features of the training tweets.
clf = LogisticRegression()
clf.fit(train_vec, Y_train)

## Evaluate model's performance

In [None]:
# With the classifier trained, predict labels for both splits:
# the training tweets (kept for reference) ...
Y_train_pred = clf.predict(train_vec)
# ... and the unseen tweets, whose predictions are printed.
Y_test_pred = clf.predict(test_vec)
print(Y_test_pred)

In [None]:
# Mean squared error between true and predicted label ids, kept as in the
# original tutorial output.
test_mse = mean_squared_error(Y_test, Y_test_pred)
print(f"Our classifier achieves a MSE of {test_mse:.2f} on the test set")
train_mse = mean_squared_error(Y_train, Y_train_pred)
print(f"Our classifier achieves a MSE of {train_mse:.2f} on the train set")

# MSE treats the label ids as magnitudes (confusing 0 with 2 costs four times
# as much as confusing 0 with 1). NOTE(review): the labels look like unordered
# sentiment classes -- confirm against the dataset description.
# Accuracy (share of exactly matching labels) does not depend on how the
# classes are numbered, so report it as well.
test_acc = (Y_test == Y_test_pred).mean()
print(f"Our classifier achieves an accuracy of {test_acc:.2%} on the test set")
train_acc = (Y_train == Y_train_pred).mean()
print(f"Our classifier achieves an accuracy of {train_acc:.2%} on the train set")

## Notes

Ideas for improving on this baseline:

* Scaling features
* Experimenting with different models 
* Using different hyperparameters for each model
* Testing which of the features are really helpful
* Creating additional synthetic features
* And many more...
