In [1]:
import re
import numpy as np
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder, PowerTransformer, MinMaxScaler
pd.set_option('max_colwidth', 1000)

# Part 1

In [2]:
# read file
listings_df = pd.read_csv('data/midterm1-data/listings.csv')
listings_df.head()

Unnamed: 0,id,address,num_rooms,available
0,7332.0,"369 Barbara Square\nWhiteberg, MT 64361",__4,Y
1,,"720 Stephen Garden Apt. 138\nSouth Haroldland, IA 96057",4,yes
2,7023.0,"PSC 9388, Box 5285\nAPO AA 33803",3,n
3,2024.0,"93507 Wells Brook Suite 898\nScottfort, IL 03739",(5),Y
4,6396.0,"35641 Ashley Burg Suite 422\nScottview, SD 80942",__6,no


In [3]:
# clean 'available' column
listings_df['available'] = listings_df['available'].replace(['y', 'Y'], 'yes')
listings_df['available'] = listings_df['available'].replace(['n', 'NO'], 'no')
listings_df['available'].unique()

array(['yes', 'no'], dtype=object)

In [4]:
# remove non-difit form num_rooms
listings_df['num_rooms'] = listings_df['num_rooms'].str.extract('(\d+)')
listings_df['num_rooms'] = listings_df['num_rooms'].astype(int)
listings_df['num_rooms'].unique()

array([4, 3, 5, 6, 2, 1])

# Part 2

In [5]:
hackers_news_df = pd.read_csv('data/midterm1-data/hacker_news.csv')
hackers_news_df.head()

Unnamed: 0,id,title,url,num_points,num_comments,author,created_at
0,12224879,Interactive Dynamic Video,http://www.interactivedynamicvideo.com/,386,52,ne0phyte,8/4/2016 11:52
1,11964716,Florida DJs May Face Felony for April Fools' Water Joke,http://www.thewire.com/entertainment/2013/04/florida-djs-april-fools-water-joke/63798/,2,1,vezycash,6/23/2016 22:20
2,11919867,Technology ventures: From Idea to Enterprise,https://www.amazon.com/Technology-Ventures-Enterprise-Thomas-Byers/dp/0073523429,3,1,hswarna,6/17/2016 0:01
3,10301696,Note by Note: The Making of Steinway L1037 (2007),http://www.nytimes.com/2007/11/07/movies/07stein.html?_r=0,8,2,walterbell,9/30/2015 4:12
4,10482257,Title II kills investment? Comcast and other ISPs are now spending more,http://arstechnica.com/business/2015/10/comcast-and-other-isps-boost-network-investment-despite-net-neutrality/,53,22,Deinos,10/31/2015 9:48


In [6]:
# sum number of row mention sql in title 
hackers_news_df['title'].str.contains('sql', flags=re.IGNORECASE).sum()


108

In [7]:
# create hn_sql contains only row mention sql in 'title'
hn_sql = hackers_news_df[hackers_news_df['title'].str.contains('sql', flags=re.IGNORECASE)]
hn_sql.head()

Unnamed: 0,id,title,url,num_points,num_comments,author,created_at
9,11370829,Crate raises $4M seed round for its next-gen SQL database,http://techcrunch.com/2016/03/15/crate-raises-4m-seed-round-for-its-next-gen-sql-database/,3,1,hitekker,3/27/2016 18:08
142,10957172,PostgreSQL: Linux VS Windows part 2,http://www.sqig.net/2016/01/postgresql-linux-vs-windows-part-2.html,16,3,based2,1/23/2016 4:21
221,11544342,MemSQL (YC W11) Raises $36M Series C,http://blog.memsql.com/memsql-raises-series-c/,74,14,ericfrenkiel,4/21/2016 18:32
394,10620525,The History of SQL Injection,http://motherboard.vice.com/read/the-history-of-sql-injection-the-hack-that-will-never-go-away,38,9,kawera,11/24/2015 13:25
419,10301554,Pentesterlab Tutorial SQL injection to web admin console to getting a shell,https://pentesterlab.com/exercises/from_sqli_to_shell,2,1,pentestercrab,9/30/2015 3:32


In [8]:
hn_sql['title'].str.extractall(r'(\w*SQL\w*)', flags=re.IGNORECASE)

Unnamed: 0_level_0,Unnamed: 1_level_0,0
Unnamed: 0_level_1,match,Unnamed: 2_level_1
9,0,SQL
142,0,PostgreSQL
221,0,MemSQL
394,0,SQL
419,0,SQL
...,...,...
19133,0,PostgreSQL
19580,0,PostgreSQL
19769,0,SQL
19802,0,PostgreSQL


In [9]:
# extract sql flavor not case sensitive
hn_sql['flavor'] = hn_sql['title'].str.extract(r'(\w+SQL)', flags=re.IGNORECASE)
hn_sql['flavor'].unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hn_sql['flavor'] = hn_sql['title'].str.extract(r'(\w+SQL)', flags=re.IGNORECASE)


array([nan, 'PostgreSQL', 'MemSQL', 'NoSQL', 'MySQL', 'SparkSQL', 'nosql',
       'mySql', 'CloudSQL'], dtype=object)

In [10]:
# convert flavour to lowercase
hn_sql['flavor'] = hn_sql['flavor'].str.lower()
hn_sql['flavor'].unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hn_sql['flavor'] = hn_sql['flavor'].str.lower()


array([nan, 'postgresql', 'memsql', 'nosql', 'mysql', 'sparksql',
       'cloudsql'], dtype=object)

In [11]:
# create pivot table index=flavour, value=num_comments, aggfunc=mean
hn_sql_pivot = pd.pivot_table(hn_sql, index='flavor', values='num_comments', aggfunc='mean')
hn_sql_pivot

Unnamed: 0_level_0,num_comments
flavor,Unnamed: 1_level_1
cloudsql,5.0
memsql,14.0
mysql,12.230769
nosql,14.529412
postgresql,25.962963
sparksql,1.0


In [12]:
# extract protocol, domain, path
pattern = r"(?P<protocol>.+)://(?P<domain>[\w\-\.]+)(?P<path>.*)"
#catch those pattern in the url column and extract the domain
url_parts = hackers_news_df['url'].str.extract(pattern, flags=re.IGNORECASE)
url_parts.tail()


Unnamed: 0,protocol,domain,path
20094,https,puri.sm,/philosophy/how-purism-avoids-intels-active-management-technology/
20095,https,medium.com,/@zreitano/the-yc-application-broken-down-and-translated-e4c0f5235081
20096,http,blog.darknedgy.net,/technology/2016/01/01/0/
20097,https,medium.com,/@benjiwheeler/how-product-hunt-really-works-d8fdcda1da74
20098,https,github.com,/jmcarp/robobrowser


# Part 3

In [13]:
# extract title start with 'ask hn'
ask_hn = hackers_news_df[hackers_news_df['title'].str.startswith('Ask HN')]
ask_hn.head()


Unnamed: 0,id,title,url,num_points,num_comments,author,created_at
6,12296411,Ask HN: How to improve my personal website?,,2,6,ahmedbaracat,8/16/2016 9:55
16,10610020,Ask HN: Am I the only one outraged by Twitter shutting down share counts?,,28,29,tkfx,11/22/2015 13:43
21,11610310,Ask HN: Aby recent changes to CSS that broke mobile?,,1,1,polskibus,5/2/2016 10:14
29,12210105,Ask HN: Looking for Employee #3 How do I do it?,,1,3,sph130,8/2/2016 14:20
30,10394168,Ask HN: Someone offered to buy my browser extension from me. What now?,,28,17,roykolak,10/15/2015 16:38


In [14]:
# extract title start with 'show hn'
show_hn = hackers_news_df[hackers_news_df['title'].str.startswith('Show HN')]
show_hn.head()

Unnamed: 0,id,title,url,num_points,num_comments,author,created_at
12,10627194,Show HN: Wio Link ESP8266 Based Web of Things Hardware Development Platform,https://iot.seeed.cc,26,22,kfihihc,11/25/2015 14:03
38,10646440,Show HN: Something pointless I made,http://dn.ht/picklecat/,747,102,dhotson,11/29/2015 22:46
45,11590768,"Show HN: Shanhu.io, a programming playground powered by e8vm",https://shanhu.io,1,1,h8liu,4/28/2016 18:05
83,12178806,Show HN: Webscope Easy way for web developers to communicate with Clients,http://webscopeapp.com,3,3,fastbrick,7/28/2016 7:11
96,10872799,Show HN: GeoScreenshot Easily test Geo-IP based web pages,https://www.geoscreenshot.com/,1,9,kpsychwave,1/9/2016 20:45


In [15]:
# Do 'Ask HN' or 'Show HN' receive more comments on average?
print('Ask HN: ', ask_hn['num_comments'].mean(), 'Show HN: ', show_hn['num_comments'].mean())
ask_hn['num_comments'].mean() == show_hn['num_comments'].mean()

Ask HN:  14.044776119402986 Show HN:  10.324720068906116


False

In [16]:
# Findings the number of ask post and comment by hour created
ask_hn['created_at'] = pd.to_datetime(ask_hn['created_at'])
ask_hn['hour_created'] = ask_hn['created_at'].dt.hour
ask_hn.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ask_hn['created_at'] = pd.to_datetime(ask_hn['created_at'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ask_hn['hour_created'] = ask_hn['created_at'].dt.hour


Unnamed: 0,id,title,url,num_points,num_comments,author,created_at,hour_created
6,12296411,Ask HN: How to improve my personal website?,,2,6,ahmedbaracat,2016-08-16 09:55:00,9
16,10610020,Ask HN: Am I the only one outraged by Twitter shutting down share counts?,,28,29,tkfx,2015-11-22 13:43:00,13
21,11610310,Ask HN: Aby recent changes to CSS that broke mobile?,,1,1,polskibus,2016-05-02 10:14:00,10
29,12210105,Ask HN: Looking for Employee #3 How do I do it?,,1,3,sph130,2016-08-02 14:20:00,14
30,10394168,Ask HN: Someone offered to buy my browser extension from me. What now?,,28,17,roykolak,2015-10-15 16:38:00,16


In [17]:
pivot_ask_hn = pd.pivot_table(ask_hn, index='hour_created', values='num_comments', aggfunc='sum')
pivot_ask_hn

Unnamed: 0_level_0,num_comments
hour_created,Unnamed: 1_level_1
0,439
1,683
2,1381
3,421
4,337
5,464
6,397
7,267
8,492
9,251


# Part 4

In [18]:
revenue_df = pd.read_csv('data/midterm1-data/revenue.csv')
revenue_df

Unnamed: 0,CustomerID,Revenue,date
0,R1004,50,2021-12-01
1,W1002,5000,2021-09-01
2,R1002,150,2021-08-01
3,W1001,2000,2021-05-01
4,W1001,2200,2021-05-01
5,R1001,100,2021-04-01
6,R1001,30,2020-12-01
7,R1003,270,2020-09-01


In [19]:
# delete repeated id each month, keep row with larger revenue
revenue_df = revenue_df.sort_values(['date', 'Revenue'], ascending=False)
revenue_df = revenue_df.drop_duplicates(subset='CustomerID', keep='first')
revenue_df


Unnamed: 0,CustomerID,Revenue,date
0,R1004,50,2021-12-01
1,W1002,5000,2021-09-01
2,R1002,150,2021-08-01
4,W1001,2200,2021-05-01
5,R1001,100,2021-04-01
7,R1003,270,2020-09-01


In [20]:
# earliest revenue of each customer
earliest_revenue = revenue_df.sort_values(['CustomerID', 'date'], ascending=True)
earliest_revenue = earliest_revenue.drop_duplicates(subset='CustomerID', keep='first')
earliest_revenue

Unnamed: 0,CustomerID,Revenue,date
5,R1001,100,2021-04-01
2,R1002,150,2021-08-01
7,R1003,270,2020-09-01
0,R1004,50,2021-12-01
4,W1001,2200,2021-05-01
1,W1002,5000,2021-09-01


# Part 5

In [21]:
bank_df = pd.read_csv('data/midterm1-data/bank-additional-full.csv', sep=';')
bank_df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [22]:
bank_df['month'].unique()

array(['may', 'jun', 'jul', 'aug', 'oct', 'nov', 'dec', 'mar', 'apr',
       'sep'], dtype=object)

In [23]:
# encode month jan: 0, feb: 1, ...
def map_month(x):
    month_dict = {'jan': 0, 'feb': 1, 'mar': 2, 'apr': 3, 'may': 4, 'jun': 5, 
                  'jul': 6, 'aug': 7, 'sep': 8, 'oct': 9, 'nov': 10, 'dec': 11}
    return month_dict[x]

bank_df['enc_month1'] = bank_df['month'].apply(lambda x: map_month(x))
bank_df['enc_month1'].unique()


array([ 4,  5,  6,  7,  9, 10, 11,  2,  3,  8])

In [24]:
# encode using encoder
encoder = OrdinalEncoder(categories=[['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']])
bank_df['enc_month2'] = encoder.fit_transform(np.array(bank_df['month']).reshape(-1, 1))
bank_df.head()
bank_df['enc_month2'].unique()

array([ 4.,  5.,  6.,  7.,  9., 10., 11.,  2.,  3.,  8.])

In [25]:
res = bank_df['enc_month1'] == bank_df['enc_month2']
res.value_counts()

True    41188
Name: count, dtype: int64

In [26]:
transformer = PowerTransformer(method='yeo-johnson', standardize=False)
bank_df['duration'] = transformer.fit_transform(np.array(bank_df['duration']).reshape(-1, 1))
bank_df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y,enc_month1,enc_month2
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no,4,4.0
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no,4,4.0
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no,4,4.0
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no,4,4.0
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no,4,4.0


In [27]:
scaler = MinMaxScaler()
bank_df['duration_T'] = scaler.fit_transform(np.array(bank_df['duration']).reshape(-1, 1))
bank_df

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y,enc_month1,enc_month2,duration_T
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no,4,4.0,0.530975
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no,4,4.0,0.459655
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no,4,4.0,0.512160
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no,4,4.0,0.461292
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no,4,4.0,0.552608
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,73,retired,married,professional.course,no,yes,no,cellular,nov,fri,...,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes,10,10.0,0.564020
41184,46,blue-collar,married,professional.course,no,no,no,cellular,nov,fri,...,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no,10,10.0,0.582818
41185,56,retired,married,university.degree,no,yes,no,cellular,nov,fri,...,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no,10,10.0,0.489278
41186,44,technician,married,professional.course,no,no,no,cellular,nov,fri,...,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes,10,10.0,0.602846
