In [3]:
import pandas as pd
import os
# Switch the working directory so the CSV below can be opened by bare filename.
# NOTE(review): `address` is not assigned in any cell visible here — confirm
# where it is defined, otherwise this raises NameError on a fresh kernel.
os.chdir(address)

# The file is read with header=None, so the column names are supplied here.
columns = [
    "submission_time",
    "upvotes",
    "url",
    "headline",
]

submissions = pd.read_csv(
    "sel_hn_stories.csv",
    names=columns,
    header=None,
)

# Preview the first rows.
submissions.head()

Unnamed: 0,submission_time,upvotes,url,headline
0,2014-06-24T05:50:40.000Z,1,flux7.com,8 Ways to Use Docker in the Real World
1,2010-02-17T16:57:59Z,1,blog.jonasbandi.net,Software: Sadly we did adopt from the construc...
2,2014-02-04T02:36:30Z,1,blogs.wsj.com,Google’s Stock Split Means More Control for L...
3,2011-10-26T07:11:29Z,1,threatpost.com,SSL DOS attack tool released exploiting negoti...
4,2011-04-03T15:43:44Z,67,algorithm.com.au,Immutability and Blocks Lambdas and Closures


In [4]:
# Percentage of missing values in each column, rounded to two decimals.
missing_pct = submissions.isnull().sum() / len(submissions) * 100
pd.DataFrame(missing_pct, columns=['% Missing Values']).round(2)

Unnamed: 0,% Missing Values
submission_time,0.0
upvotes,0.0
url,6.3
headline,0.33


In [3]:
# Who has posted the most, and how many posts did they make?
# value_counts() sorts by frequency (descending), so the first entry is the
# most active author.
# NOTE(review): the column names assigned when the CSV is loaded do not include
# 'author' — confirm which dataset this cell was written against.
author_post_counts = submissions['author'].value_counts()
top_author = author_post_counts.index.tolist()[0]
top_author_posts = author_post_counts.tolist()[0]
print('Highest number of posts is {1} made by {0}'.format(top_author, top_author_posts))

Highest number of posts is 6311 made by shawndumas


Oh wow, such an active user! Let's see how many points he has received.

In [4]:
# Compare the mean points of shawndumas' posts with the mean over all posts.
# NOTE(review): 'author' and 'points' are not among the column names assigned
# when the CSV is loaded — confirm which dataset this cell was written against.
is_shawn = submissions['author'] == 'shawndumas'
shawn = submissions[is_shawn]
shawn_mean_points = shawn['points'].mean()
overall_mean_points = submissions['points'].mean()
print('shawn recieved {0:.2f} average points, while average points for all posts is {1:.2f}'.format(shawn_mean_points, overall_mean_points))

shawn recieved 9.27 average points, while average points for all posts is 10.24


Okay, this doesn't suggest a relationship between the user and the upvotes.  

I don't plan to utilize url_hostname for now. The only columns I will be using for regression analysis are headlines and points. Let's drop the url column and then drop the rows with a missing title.

In [5]:
# Remove every row that has a missing value in any column, then show the
# remaining (rows, columns).
# NOTE(review): no column subset is passed, so rows that are only missing `url`
# are dropped as well — confirm that is intended.
submissions = submissions.dropna(axis=0, how='any')
submissions.shape

(2801, 4)

This is fairly large, I will only use 5% of the data for now.

In [6]:
# Keep a random 5% of the rows. The seed is fixed so the sample, and every
# number derived from it further down, is identical on each run.
# reset_index() renumbers the rows 0..n-1 and keeps the old labels in a new
# `index` column.
submissions = submissions.sample(frac=0.05, axis=0, random_state=1).reset_index()
submissions.head()

Unnamed: 0,index,title,points
0,1293117,Electric Cigarette - TOP Rated Electric Cigare...,1
1,846234,Import contacts from Google by entering your e...,2
2,546115,The Dot Com Boom Is Giving Way To The Dot Chin...,1
3,1403870,Some practical questions to ask oneself to get...,12
4,22551,Configuration Management as a Service,1


There are four ways to remove punctuation:
* **sets**
    - exclude = set(string.punctuation) \n s = ''.join(ch for ch in s if ch not in exclude)
* **regex**
    - s = re.sub(r'[^\w\s]','',s) OR re.compile('[%s]' % re.escape(string.punctuation)).sub('',s)
* **translate**
    - s = s.translate(str.maketrans('','',string.punctuation))
* **replace**
    - for c in string.punctuation: \n s=s.replace(c,"")

Among all these approaches, the `translate()` method beats the others in terms of speed; please refer to **[this post](https://stackoverflow.com/questions/265960/best-way-to-strip-punctuation-from-a-string-in-python)** on StackOverflow. Note that the syntax shown for `translate()` in that post applies to Python 2. For Python 3, please refer to **[this post](https://stackoverflow.com/questions/23175809/str-translate-gives-typeerror-translate-takes-one-argument-2-given-worked-i)**.

In [7]:
# Remove punctuation from every headline.
import string

# Build the deletion table once instead of once per row.
# string.punctuation contains ASCII punctuation only, so typographic
# characters such as ’ are left in place.
punctuation_table = str.maketrans('', '', string.punctuation)
submissions['title_nopuncs'] = submissions['headline'].apply(
    lambda headline: headline.translate(punctuation_table)
)

Also, we should lowercase the titles. Apple, apple, and APPLE are all the same!

In [8]:
# Lowercase every punctuation-free title so that case variants of a word match.
submissions['title_nopuncs'] = [
    title.lower() for title in submissions['title_nopuncs']
]

Now, we'd like to tokenize the titles. I use the `split()` function. One could use `nltk.tokenize` as well. Based on **[this post](https://stackoverflow.com/questions/9602856/most-efficient-way-to-split-strings-in-python)**, `split()` works fairly well on strings that are not too long.

In [9]:
# Split each cleaned title on whitespace into a list of tokens.
submissions['tokenz'] = [
    title.split() for title in submissions['title_nopuncs']
]
# Preview the first token lists.
submissions['tokenz'].head()

0     [8, ways, to, use, docker, in, the, real, world]
1    [software, sadly, we, did, adopt, from, the, c...
2    [google’s, stock, split, means, more, control,...
3    [ssl, dos, attack, tool, released, exploiting,...
4    [immutability, and, blocks, lambdas, and, clos...
Name: tokenz, dtype: object

Now, we should find the unique tokens. I can think of two approaches:
* create a master list of all the tokens, and call `unique()` on it.
* create an empty list, and append the unique tokens to it. **Don't do this! It takes forever! Obviously.**


**OR** use **[this](https://stackoverflow.com/questions/1720421/how-to-concatenate-two-lists-in-python)** awesome post on StackOverflow and find the following approach!

In [10]:
import itertools

# Flatten the per-headline token lists into one list holding every word.
words = list(itertools.chain.from_iterable(submissions['tokenz']))

# Distinct words. Sorting gives the vocabulary a stable order; the iteration
# order of a set of strings can change between interpreter sessions, which
# would reorder anything built from this list.
unique_words = sorted(set(words))

print('Number of unique words:', len(unique_words))

Number of unique words: 6864


In [11]:
# Show the first five token lists again (head() defaults to five rows).
submissions['tokenz'].head()

0     [8, ways, to, use, docker, in, the, real, world]
1    [software, sadly, we, did, adopt, from, the, c...
2    [google’s, stock, split, means, more, control,...
3    [ssl, dos, attack, tool, released, exploiting,...
4    [immutability, and, blocks, lambdas, and, clos...
Name: tokenz, dtype: object

In [12]:
import numpy as np
import pandas as pd
# All-zero table with one row per headline and one column per vocabulary word.
n_headlines = len(submissions['tokenz'])
counts = pd.DataFrame(0, index=np.arange(n_headlines), columns=unique_words)

In [None]:
# Count how often each word appears in each headline and store it in `counts`.
#
# Two fixes compared with a chained `counts.iloc[row][token] += 1`:
#   * chained indexing may assign into a temporary copy of the row, so the
#     update is not guaranteed to reach `counts`; `.iat` writes to the frame
#     itself;
#   * rows are addressed by position via enumerate(), so the result does not
#     depend on the index labels of `submissions` being exactly 0..n-1.
column_position = {word: position for position, word in enumerate(counts.columns)}
for row_position, tokens in enumerate(submissions['tokenz']):
    for token in tokens:
        counts.iat[row_position, column_position[token]] += 1

Interestingly, we could use `sklearn.feature_extraction`, which does all the steps that we have just implemented!

In [30]:
from sklearn.feature_extraction.text import CountVectorizer
# Build the document-term matrix directly from the raw headlines.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(list(submissions['headline']))

# get_feature_names() was removed from newer scikit-learn releases in favour
# of get_feature_names_out(); use whichever this installation provides.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Dense table: one row per headline, one column per vocabulary term.
counts = pd.DataFrame(X.toarray(), columns=feature_names)

Too many columns. There are two types of features that will reduce regression accuracy:
* The ones that occur only a few times. These will cause over fitting.
* The ones that occur too many times, such as `a` and `and`. These are often called `stopwords`, and do not indicate any relationship with the upvotes.  
Let's remove any word that occurs fewer than 5 or more than 100 times.

In [31]:
# Total number of occurrences of each word across all headlines.
count_sum = counts.sum()

# Discard words seen more than 100 times or fewer than 5 times.
too_common = count_sum > 100
too_rare = count_sum < 5
counts = counts.drop(columns=count_sum[too_common | too_rare].index)

In [32]:
# (rows, columns) of the word-count table.
counts.shape

(2801, 680)

In [33]:
# Split the data into train and validation sets.
# train_test_split lives in sklearn.model_selection in current scikit-learn;
# the old sklearn.cross_validation module is kept only as a fallback for
# installations that predate it.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# 80% of the rows go to training; random_state fixes the shuffle so the split
# is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    counts,
    submissions['upvotes'],
    train_size=0.8,
    random_state=1,
)

In [34]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Fit a linear regression on the training split and predict the held-out rows.
lr = LinearRegression()
lr.fit(X_train, y_train)
pred = lr.predict(X_test)

# mean_squared_error expects (y_true, y_pred); the square root of the MSE is
# the RMSE, expressed in the same unit as the target.
rmse = mean_squared_error(y_test, pred) ** 0.5
rmse

47.540588945715584