### Kaggle datasets: Stack Overflow python questions analysis

In [34]:
from __future__ import division
from collections import Counter, defaultdict
import math
import os
import re
import string

from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Disable pandas SettingWithCopyWarning
pd.options.mode.chained_assignment = None

work_dir = os.environ['WORK_DIR']

# Load data (Questions.csv downloaded from https://www.kaggle.com/stackoverflow/pythonquestions)
df = pd.read_csv(os.path.join(work_dir, 'Questions.csv'))
df.head()


Unnamed: 0,Id,OwnerUserId,CreationDate,Score,Title,Body
0,469,147.0,2008-08-02T15:11:16Z,21,How can I find the full path to a font from it...,<p>I am using the Photoshop's javascript API t...
1,502,147.0,2008-08-02T17:01:58Z,27,Get a preview JPEG of a PDF on Windows?,<p>I have a cross-platform (Python) applicatio...
2,535,154.0,2008-08-02T18:43:54Z,40,Continuous Integration System for a Python Cod...,<p>I'm starting work on a hobby project with a...
3,594,116.0,2008-08-03T01:15:08Z,25,cx_Oracle: How do I iterate over a result set?,<p>There are several ways to iterate over a re...
4,683,199.0,2008-08-03T13:19:16Z,28,Using 'in' to match an attribute of Python obj...,<p>I don't remember whether I was dreaming or ...


In [35]:
# Limit size

print df.size
df_frac = df.sample(frac=.25)
print df_frac.size
del df_frac['Body']

3643692
910926


In [36]:
# Clean strings - remove stopwords, punctuation, make uppercase

stopwords_en = stopwords.words('english')

def remove_stopwords(row):
    if row.Title:
        return ' '.join([word for word in row.Title.split() if word not in stopwords_en])

def remove_non_letters(row):
    regex = re.compile('[^a-zA-Z]')
    if row.Title:       
        return ' '.join([regex.sub('', word) for word in row.Title.split()])

def make_lowercase(row):
    if row.Title:
        return ' '.join([word.lower() for word in row.Title.split()])

def get_bag_of_words(row):
    if row.Title:
        return Counter(row.Title.split())

df_frac['Title'] = df_frac.apply(remove_stopwords, axis=1)
df_frac['Title'] = df_frac.apply(remove_non_letters, axis=1)
df_frac['Title'] = df_frac.apply(make_lowercase, axis=1)
df_frac['TitleBagOfWords'] = df_frac.apply(get_bag_of_words, axis=1)
df_frac.dropna(subset=['Title'], inplace=True)
df_frac.head()



Unnamed: 0,Id,OwnerUserId,CreationDate,Score,Title,TitleBagOfWords
491275,35213176,5741225.0,2016-02-04T22:47:50Z,1,python while loops tuples,"{u'python': 1, u'loops': 1, u'while': 1, u'tup..."
418452,31742099,2899150.0,2015-07-31T08:58:18Z,0,cannot install kiviy mingw,"{u'mingw': 1, u'cannot': 1, u'kiviy': 1, u'ins..."
352473,28244921,1814314.0,2015-01-30T21:28:55Z,2,how i get calling expression function python,"{u'function': 1, u'get': 1, u'i': 1, u'calling..."
603015,39987860,5522848.0,2016-10-11T22:38:13Z,3,update pandas cells based column values other ...,"{u'based': 1, u'column': 1, u'cells': 1, u'upd..."
15810,2131417,84478.0,2010-01-25T09:57:36Z,4,using python like php apachewindows,"{u'python': 1, u'using': 1, u'php': 1, u'like'..."


In [37]:
# Compute tf-idf (term frequency - inverse document frequency)

df = df_frac.copy()
DOCUMENTS_COUNT = df.shape[0]

# get document frequencies
global_counter = defaultdict(int)
for title in df.TitleBagOfWords:
    for word, count in title.iteritems():
        global_counter[word] += 1
    
def compute_tf_idf(row):
    tf_idf = {}
    for word, count in row.TitleBagOfWords.iteritems():
        tf = count / len(row['Title'])
        idf = math.log(DOCUMENTS_COUNT) / global_counter[word]
        tf_idf[word] = tf * idf
    row['tf-idf'] = tf_idf
    return row

df = df.apply(compute_tf_idf, axis=1)
df.head()

Unnamed: 0,Id,OwnerUserId,CreationDate,Score,Title,TitleBagOfWords,tf-idf
491275,35213176,5741225.0,2016-02-04T22:47:50Z,1,python while loops tuples,"{u'python': 1, u'loops': 1, u'while': 1, u'tup...","{u'python': 7.91446173063e-06, u'loops': 0.001..."
418452,31742099,2899150.0,2015-07-31T08:58:18Z,0,cannot install kiviy mingw,"{u'mingw': 1, u'cannot': 1, u'kiviy': 1, u'ins...","{u'cannot': 0.000367974137138, u'mingw': 0.020..."
352473,28244921,1814314.0,2015-01-30T21:28:55Z,2,how i get calling expression function python,"{u'function': 1, u'get': 1, u'i': 1, u'calling...","{u'function': 5.37349902456e-05, u'how': 1.165..."
603015,39987860,5522848.0,2016-10-11T22:38:13Z,3,update pandas cells based column values other ...,"{u'based': 1, u'column': 1, u'cells': 1, u'upd...","{u'based': 0.000162646655501, u'column': 0.000..."
15810,2131417,84478.0,2010-01-25T09:57:36Z,4,using python like php apachewindows,"{u'python': 1, u'using': 1, u'php': 1, u'like'...","{u'python': 5.65318695045e-06, u'using': 2.634..."


In [38]:
# Get top 50 words

def get_top_word(row):
    return max(row['tf-idf'], key=row['tf-idf'].get)

df['top_word'] = df.apply(get_top_word, axis=1)

top_words_counter = defaultdict(int)
for word in df.top_word:
    top_words_counter[word] += 1
    
top_50 = Counter(top_words_counter).most_common(50)
for word, count in top_50:
    print '{} count: {}'.format(word, count)

append count: 118
compare count: 113
callable count: 112
tuple count: 106
iterating count: 99
iterate count: 99
sorting count: 96
appending count: 96
txt count: 95
comprehension count: 93
split count: 92
sort count: 92
assign count: 92
dictionaries count: 91
replace count: 90
tuples count: 89
merge count: 88
looping count: 88
formatting count: 87
substring count: 85
concatenate count: 84
passing count: 84
comparing count: 84
xpath count: 82
indices count: 79
replacing count: 79
extracting count: 77
mysqldb count: 76
insert count: 76
indexerror count: 75
searching count: 75
filtering count: 73
loops count: 73
accessing count: 73
beautiful count: 73
iterable count: 73
writing count: 73
lxml count: 72
pattern count: 72
parsing count: 72
scraping count: 71
php count: 71
indexing count: 71
javascript count: 71
assigning count: 71
printing count: 71
extract count: 71
inserting count: 70
splitting count: 70
combine count: 70
