## List all available CSV files

---

In [13]:
import os
from os import listdir
from os.path import isfile, join

def get_all_tags_filenames(folder_path = '../data/'):
    '''
    Get all csv file names

    Parameters
    ----------
    folder_path : str
        Folder to scan. Only its direct entries are inspected (no recursion).

    Returns
    -------
    list of str
        Bare names (without the folder part) of the regular files in
        ``folder_path`` whose name ends with ".csv", in the order
        ``listdir`` yields them.

    Raises
    ------
    OSError
        Propagated from ``listdir`` when ``folder_path`` cannot be listed
        (for example, it does not exist).
    '''
    # The working directory is intentionally left untouched. Calling
    # os.chdir(folder_path) first made a relative folder_path resolve a second
    # time against the new directory, and silently changed how every later
    # relative path in the notebook was resolved.
    return [f for f in listdir(folder_path) if isfile(join(folder_path, f)) and f.endswith(".csv")]

In [42]:
# List the CSV file names found in the default data folder
get_all_tags_filenames()

['uwp.csv',
 'magento.csv',
 'assembly.csv',
 'jestjs.csv',
 'hive.csv',
 'flutter.csv',
 'list.csv',
 'mobile.csv',
 'batch-file.csv',
 'animation.csv',
 'oracle11g.csv',
 'rxjs.csv',
 'wcf.csv',
 'gwt.csv',
 'lambda.csv',
 'asp.net-mvc.csv',
 'dependency-injection.csv',
 'sqlite.csv',
 'database.csv',
 'scroll.csv',
 'visual-studio-2012.csv',
 'c#.csv',
 'opengl.csv',
 'visual-studio-2015.csv',
 'version-control.csv',
 'neo4j.csv',
 'callback.csv',
 'mongodb.csv',
 'drupal.csv',
 'dart.csv',
 'uiviewcontroller.csv',
 'types.csv',
 'maven.csv',
 'java.csv',
 'ionic-framework.csv',
 'cmd.csv',
 'go.csv',
 'reflection.csv',
 'asp.net-mvc-5.csv',
 'uitableview.csv',
 'path.csv',
 'datatable.csv',
 'grails.csv',
 'symfony.csv',
 'django-rest-framework.csv',
 'vector.csv',
 'autocomplete.csv',
 'redis.csv',
 'ruby-on-rails.csv',
 'deployment.csv',
 'xaml.csv',
 'merge.csv',
 'mod-rewrite.csv',
 'spring-security.csv',
 'math.csv',
 'svn.csv',
 'ruby-on-rails-4.csv',
 'image-processing.csv',

## Read a CSV file as a data frame

---

In [39]:
import pandas as pd
import datetime as dt

def get_tag_df(file_name, folder_path = '../data/'):
    '''
    Read one tag file into a typed data frame

    The file is parsed with ':' as the field separator and must provide the
    columns 'Unnamed: 0' (a saved index, i.e. an empty first header cell),
    'date', 'id', 'vote', 'answer', 'views' and 'accepted'.

    Parameters
    ----------
    file_name : str
        Name of the file inside ``folder_path``.
    folder_path : str
        Folder that contains the file.

    Returns
    -------
    pandas.DataFrame
        Rows whose saved index is missing are removed, the saved index column
        is dropped and the frame is re-indexed from 0. 'date' is converted with
        ``pd.to_datetime``; 'id', 'vote', 'answer' and 'views' are cast to int;
        'accepted' is bool.
    '''
    raw_data = pd.read_csv(join(folder_path, file_name), sep=':')
    tag_data = (
        raw_data
        # Rows without a saved index value are discarded before the casts below
        .dropna(subset=['Unnamed: 0'])
        # Drop redundant index column
        .drop(columns=['Unnamed: 0'])
        # Update index
        .reset_index(drop=True)
    )
    # Set data types
    tag_data['date'] = pd.to_datetime(tag_data['date'])
    for column in ('id', 'vote', 'answer', 'views'):
        tag_data[column] = tag_data[column].astype(int)
    # Compare on the string form: depending on the file contents pandas may
    # deliver this column either as the text 'True'/'False' or as real
    # booleans, and a plain `value == 'True'` is False for every real boolean.
    tag_data['accepted'] = tag_data['accepted'].astype(str).eq('True').astype(bool)
    return tag_data

In [44]:
# Preview a single tag file loaded from the default data folder
get_tag_df('cakephp.csv')

Unnamed: 0,tag,id,vote,answer,views,accepted,date
0,cakephp,69649623,0,1,12,False,2021-10-20
1,cakephp,69637611,0,0,28,False,2021-10-19
2,cakephp,69633927,0,0,42,False,2021-10-19
3,cakephp,69633433,0,0,20,False,2021-10-19
4,cakephp,69628876,0,1,29,False,2021-10-19
...,...,...,...,...,...,...,...
4995,cakephp,40028327,0,2,589,False,2016-10-13
4996,cakephp,40026574,0,1,384,True,2016-10-13
4997,cakephp,40026349,0,0,56,False,2016-10-13
4998,cakephp,40023269,0,1,46,False,2016-10-13


## Example of joining several tables

---

In [46]:
# Tag files to load, in the order their frames are combined later
scrapy_files = [
    'beautifulsoup.csv',
    'lxml.csv',
    'selenium.csv',
    'scrapy.csv',
    'mechanicalsoup.csv',
    'urllib.csv',
    'urllib2.csv',
]

scrapy_libs = [get_tag_df(csv_name) for csv_name in scrapy_files]

# Keep every frame reachable under its own tag name as well
beautifulsoup, lxml, selenium, scrapy, mechanicalsoup, urllib, urllib2 = scrapy_libs

In [49]:
# Stack the per-tag frames row-wise. Each frame keeps its own 0-based index,
# so index labels repeat across tags in the combined frame.
scrapy_df = pd.concat(scrapy_libs)

In [50]:
# Show the combined frame
scrapy_df

Unnamed: 0,tag,id,vote,answer,views,accepted,date
0,beautifulsoup,69651927,0,1,14,False,2021-10-20
1,beautifulsoup,69651870,0,0,9,False,2021-10-20
2,beautifulsoup,69648558,0,0,14,False,2021-10-20
3,beautifulsoup,69646624,0,2,43,False,2021-10-20
4,beautifulsoup,69646378,-1,0,24,False,2021-10-20
...,...,...,...,...,...,...,...
2803,urllib2,163603,6,2,6000,True,2008-10-02
2804,urllib2,163009,31,14,40000,True,2008-10-02
2805,urllib2,151929,6,3,6000,True,2008-09-30
2806,urllib2,148853,13,7,9000,True,2008-09-29


In [58]:
# Build a question-by-tag table: one row per question id, one column per tag.
# A cell holds the median 'views' recorded for that question under that tag,
# and 0 where the question has no row for the tag, so a non-zero cell is used
# as the marker that the tag is assigned to the question.
questions_df = pd.pivot_table(
    scrapy_df,
    index=['id'],
    columns='tag',
    values='views',
    aggfunc='median',
    fill_value=0,
)

In [59]:
# Keep the questions that carry 'beautifulsoup' together with at least one other
# tag of interest: the 'beautifulsoup' cell is non-zero and strictly smaller than
# the row total, so some other column must contribute to the total as well
# (assumes view counts are never negative).
# Whole-column comparisons replace the per-row lambda and select the same rows.
questions_df[
    (questions_df['beautifulsoup'] > 0)
    & (questions_df['beautifulsoup'] < questions_df.sum(axis=1))
]

tag,beautifulsoup,lxml,mechanicalsoup,scrapy,selenium,urllib,urllib2
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
64049423,50,50,0,0,0,0,0
64066003,129,129,0,0,0,0,0
64085429,91,0,0,91,0,0,0
64173263,47,47,0,0,0,0,0
64200882,139,140,0,0,0,0,0
...,...,...,...,...,...,...,...
69600452,32,0,0,0,33,0,0
69617685,57,0,0,0,0,57,0
69632674,16,0,0,0,17,0,0
69633732,26,0,0,0,26,0,0
