# Linear regression model and baseline model


In [1]:
# import packages

import numpy as np
import pandas as pd
import tensorflow as tf
import datetime
import matplotlib.pyplot as plt
import pickle

from utils.tf_utils import random_mini_batches

from sklearn.model_selection import train_test_split
import re
import string

In [2]:
# helper functions for loading and preprocessing the data

def load_data(train_or_test):
    '''
    Load the pickled combined dataset for the requested split.

    Args:
        train_or_test: 'train' or 'test'; selects which
            CombinedTWTAQ_<split>.pkl file inside the "data" directory
            (relative to the current working directory) is read.

    Returns:
        The object stored in the pickle file, or None when train_or_test
        is not one of the two accepted values.

    Raises:
        OSError: if the selected file cannot be opened.
    '''
    import os  # local import keeps this cell independent of the import cell

    if train_or_test not in ('train', 'test'):
        return None
    # Build the path portably. The previous literal 'data\CombinedTWTAQ_'
    # relied on the invalid escape sequence '\C' and only resolved to a
    # file inside the data directory on Windows.
    path = os.path.join('data', 'CombinedTWTAQ_' + train_or_test + '.pkl')
    # NOTE(review): pickle.load can execute arbitrary code while
    # deserialising; only point this at trusted, locally produced files.
    with open(path, 'rb') as infile:
        return pickle.load(infile)

def clean_text(text):
    '''
    Normalise a string: lowercase it, drop punctuation, drop URL remnants.

    The steps run in that order. Every character listed in
    string.punctuation is deleted first, so by the time URLs are handled
    they have already lost characters such as ":" and "/"; what is removed
    is any run of non-whitespace characters that starts with "http".
    '''
    punctuation_table = str.maketrans('', '', string.punctuation)
    lowered = text.lower()
    without_punctuation = lowered.translate(punctuation_table)
    return re.sub(r"http\S+", "", without_punctuation)
    
def clean(x):
    '''
    Per-element wrapper around clean_text (used as the apply callback in df_to_xy).
    '''
    # A named def instead of an assigned lambda: same call behaviour, but
    # the function reports a real name in tracebacks and reprs.
    return clean_text(x)

def df_to_xy(df, pp_index=75):
    '''
    Split a loaded dataframe into cleaned text inputs and target values.

    Args:
        df: object exposing a "text" column and a "PP" column, each
            supporting .apply (a pandas DataFrame in this notebook).
        pp_index: position read from every PP entry to form the target.
            Defaults to 75, the value that used to be hard-coded.
            NOTE(review): what position 75 represents is not visible
            here; confirm against the code that builds the pickle.

    Returns:
        Tuple (x, y): x is df.text with clean applied to every element,
        y is the .values array of the selected PP elements.
    '''
    x = df.text.apply(clean)
    y = df.PP.apply(lambda entry: entry[pp_index]).values
    return (x, y)


def sqrt_mse(y):
    '''
    Return sqrt(sum(y**2) / len(y)), the root mean square of y.

    Applied to raw targets this equals the root-mean-squared error of a
    predictor that always outputs 0.
    '''
    squared_total = np.sum(np.square(y))
    n_items = len(y)
    return np.sqrt(squared_total / n_items)

## Load data

In [3]:

# Load the training split and convert it to cleaned text / target arrays.
df_train = load_data('train')
x, y = df_to_xy(df_train)

# Hold out 200 examples as the dev set; random_state pins the shuffle.
x_train, x_dev, y_train, y_dev = train_test_split(
    x, y, test_size=200, random_state=50
)

# sqrt_mse of the raw targets = error of always predicting 0.
print(sqrt_mse(y_train), sqrt_mse(y_dev))

# Number of examples in each split.
m_train, m_dev = x_train.shape[0], x_dev.shape[0]

0.15128836169147417 0.15246748337145208
(4472, 1145)
(1145,)
CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words='english',
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)


In [4]:
# Notebook inspection: display the container type of the training inputs.
# NOTE(review): the recorded output below (a scipy sparse matrix) does not
# match the pandas-based split shown in the cells above; presumably a
# vectorisation step was run that is missing from this export. Confirm.
type(x_train)

scipy.sparse.csr.csr_matrix

## Model

In [5]:
# "Guess 0" baseline error on the train and dev targets. Reuses sqrt_mse
# instead of repeating the root-mean-square formula inline, so every
# baseline number in the notebook comes from the same helper.
print(sqrt_mse(y_train))
print(sqrt_mse(y_dev))

0.15128836169147417
0.15246748337145208


## Conclusion

Guess 0 (baseline that always predicts 0; errors are the RMSE values computed above, shown ×100):
- Train error: 15.13
- Dev error: 15.25
- Test error: 15.26


In [7]:
# Load the test split and prepare it the same way as the training data.
df_test = load_data('test')

x_test, y_test = df_to_xy(df_test)
# sqrt_mse of the raw test targets = error of always predicting 0.
print( sqrt_mse(y_test) )

0.15264363624391106
