## Data Cleaning 

In [1]:
# import librariesy 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import re
%matplotlib inline

In [2]:
raw_df = pd.read_csv('~/Downloads/stack-overflow-bqresults.csv')

In [3]:
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250000 entries, 0 to 249999
Data columns (total 4 columns):
id       250000 non-null int64
title    80283 non-null object
body     250000 non-null object
tags     80284 non-null object
dtypes: int64(1), object(3)
memory usage: 7.6+ MB


In [4]:
# 80284 non-null types and title out of 250000 entries
# remove observations without tags or title 
raw_df.dropna(inplace=True)

In [5]:
# make remove html tags from body
raw_df['body'] = raw_df.body.apply(lambda x: BeautifulSoup(x).get_text())

In [7]:
raw_df.body[1]

'Here is a piece of C++ code that seems very peculiar. For some strange reason sorting the data miraculously makes the code almost six times faster. #include <algorithm> #include <ctime> #include <iostream> int main() { // Generate data const unsigned arraySize = 32768; int data[arraySize]; for (unsigned c = 0; c < arraySize; ++c) data[c] = std::rand() % 256; // !!! With this the next loop runs faster std::sort(data data + arraySize); // Test clock_t start = clock(); long long sum = 0; for (unsigned i = 0; i < 100000; ++i) { // Primary loop for (unsigned c = 0; c < arraySize; ++c) { if (data[c] >= 128) sum += data[c]; } } double elapsedTime = static_cast<double>(clock() - start) / CLOCKS_PER_SEC; std::cout << elapsedTime << std::endl; std::cout << sum = << sum << std::endl; }   Without std::sort(data data + arraySize); the code runs in 11.54 seconds. With the sorted data the code runs in 1.93 seconds.  Initially I thought this might be just a language or compiler anomaly. So I tried it

In [8]:
def check_topic(L1, L2):
    for l1 in L1:
        if l1 in L2:
            return True
    return False

In [9]:
# step one:
# change anyting has git in it as version control topic 
raw_df['topic'] = raw_df.tags.apply(lambda x: 'version_control' if re.search(
    '(^git|git$)', x) and 'digit' not in x else x)

In [10]:
# might be better to convert the topic into a list 
raw_df.topic = raw_df.topic.str.split('|')

In [11]:
# step two:
# change anyting has javascript as web development
web_frontend = ['javascript', 'jquery',
             'html', 'css']
raw_df['topic'] = raw_df.topic.apply(
    lambda x: ['web_frontend'] if check_topic(web_frontend, x) else x)

In [12]:
# change anyting contains android as android topic
raw_df['topic'] =raw_df.topic.apply(
    lambda x: ['android_related'] if check_topic(['android'],x) else x)

In [18]:
raw_df.topic.apply(lambda x: '|'.join(x)).value_counts()[:10]

web_frontend       11638
android_related     5663
version_control     2382
python               401
java                 286
c#                   217
php                  216
angularjs            195
c#|.net              175
vim                  172
Name: topic, dtype: int64

In [None]:
# bring python topic into next category 

In [None]:
# bring java into next category 

In [None]:
# bring c# into next category

In [None]:
# step three:
# change anything that has 

In [23]:
raw_df[raw_df.topic.apply(lambda x: '|'.join(x)).str.contains('android')]

Unnamed: 0,id,title,body,tags,topic
84,2025282,Difference between px dp dip and sp on Android?,What is the difference between: px dip dp sp ...,android|android-layout|user-interface|units-of...,"[android, android-layout, user-interface, unit..."
204,1554099,Why is the Android emulator so slow? How can w...,I have a 2.67 GHz Celeron processor and 1.21 G...,android|performance|android-emulator|qemu,"[android, performance, android-emulator, qemu]"
221,13375357,Proper use cases for Android UserManager.isUse...,I was looking at the new APIs introduced in An...,java|android|usermanager,"[java, android, usermanager]"
286,1109022,Close/hide the Android Soft Keyboard,I have an EditText and a Button in my layout. ...,android|android-softkeyboard|android-keypad|an...,"[android, android-softkeyboard, android-keypad..."
449,151777,Saving Android Activity state,I've been playing around with the Android SDK ...,android|android-activity|application-state,"[android, android-activity, application-state]"
470,1555109,Stop EditText from gaining focus at Activity s...,I have an Android Activity with two elements: ...,android|android-listview|android-edittext,"[android, android-listview, android-edittext]"
482,2194808,Debug certificate expired error in Eclipse And...,I am using Eclipse Android plugins to build a ...,android|eclipse|certificate,"[android, eclipse, certificate]"
493,2785485,Is there a unique Android device ID?,Do Android devices have a unique ID and if so ...,java|android|uniqueidentifier,"[java, android, uniqueidentifier]"
665,541966,Lazy load of images in ListView,I am using a ListView to display some images a...,android|image|url|android-listview|universal-i...,"[android, image, url, android-listview, univer..."
681,101754,Is there a way to run Python on Android?,We are working on an S60 version and this plat...,python|android|ase|android-scripting,"[python, android, ase, android-scripting]"


In [None]:
# make sure to not double count, only replace things have git as git 

In [84]:
raw_df.tags = raw_df.tags.str.split('|')

In [87]:
# replace anything match git related as the word git 
raw_df.tags.apply(lambda tag: set('git' if re.match('(^git|git$)',x) else x for x in tag))

1         {branch-prediction, optimization, c++, java, p...
5                                                     {git}
11                                             {amend, git}
12                                                    {git}
21                                     {content-type, json}
                                ...                        
249971                         {navigation-drawer, android}
249972                            {stdin, argparse, python}
249973                             {backend, java, logging}
249974                 {javascript, attributes, dom, d3.js}
249975                              {django-models, django}
Name: tags, Length: 80283, dtype: object

In [60]:
temp_df = pd.DataFrame(pd.concat([pd.Series(row['id'], row['tags'].split('|'))
           for _, row in raw_df.iterrows()])).reset_index()

In [61]:
temp_df.rename(columns = {'index':'tags',0:'id'}, inplace = True)

In [62]:
split_df = pd.merge(temp_df, raw_df,on='id')

In [63]:
split_df.drop(columns = ['tags_y'], inplace=True)

In [64]:
split_df.rename(columns={'tags_x':'tags'}, inplace = True)

In [65]:
split_df.head()

Unnamed: 0,tags,id,title,body
0,java,11227809,Why is processing a sorted array faster than a...,Here is a piece of C++ code that seems very pe...
1,c++,11227809,Why is processing a sorted array faster than a...,Here is a piece of C++ code that seems very pe...
2,performance,11227809,Why is processing a sorted array faster than a...,Here is a piece of C++ code that seems very pe...
3,optimization,11227809,Why is processing a sorted array faster than a...,Here is a piece of C++ code that seems very pe...
4,branch-prediction,11227809,Why is processing a sorted array faster than a...,Here is a piece of C++ code that seems very pe...


# should be able to merge a few tags and use common sense to make them into the same tag category
