# Pandas Practice
### Practice Pandas using questions from online

In [1]:
# Q1
import pandas as pd
import numpy as np

## Transform objects to series
##### `np.arange(start,stop,step)`  has three parameters normally
##### When there is only one parameter, default start with `0` and step is `1` stop till `parameter - 1` 


In [4]:
mylist = list('Han')
myarr = np.arange(26)
mydict = dict(zip(mylist, myarr))

myarr1 = np.array(mylist)
myarr2 = np.array(myarr)
myarr3 = np.array(mydict)

ser1 = pd.Series(mylist)
ser2 = pd.Series(myarr)
ser3 = pd.Series(mydict)

## Transform series to dataframe

#### `zip(a,b)` can turn `a` and `b` into tuple but `a` and `b` must be iteratable.
#### `DataFrame.reset_index ()`  can turn the original index into column.

In [16]:
mylist = list('Han')
myarr = np.arange(26)
mydict = dict(zip(mylist, myarr))
ser = pd.Series(mydict)

df1 = ser.to_frame()
df1 = df1.reset_index()
df1

Unnamed: 0,index,0
0,H,0
1,a,1
2,n,2


## Combine two series to form a dataframe
#### `pd.Series(list)` will break down list by elements.

In [22]:
import numpy as np
ser1 = pd.Series(list('Han'))
ser2 = pd.Series(np.arange(3))

# solution 1
dfQ4_1 = pd.DataFrame({'col1':ser1,'col2':ser2})

# solution 2
dfQ4_2 = pd.concat([ser1,ser2],axis=1)
dfQ4_2

Unnamed: 0,0,1
0,H,0
1,a,1
2,n,2


## Assign name to the series's index

In [24]:
ser = pd.Series(list('Han'))

ser.name="ser_name"
ser.name

'ser_name'

## Get the items of series A not present in series B
##### `~`can reverse the result, which is turn `True` into `False` and `False` into `True`.
##### `ser1.isin(ser2)`  return a list containing `True` and `False` for each position in ser1.

In [28]:
ser1 = pd.Series([1, 2, 3, 4, 5])
ser2 = pd.Series([4, 5, 6, 7, 8])

ser1[~ser1.isin(ser2)]
ser1.isin(ser2)

0    False
1    False
2    False
3     True
4     True
5    False
6    False
dtype: bool

## Get the items not common to both series A and series B
##### `np.union1d(ser1,ser2)` find union of two groups
#### `np.intersect1d(ser1,ser2)` find intersection of two groups

In [29]:
ser1 = pd.Series([1, 2, 3, 4, 5])
ser2 = pd.Series([4, 5, 6, 7, 8])

ser_u = pd.Series(np.union1d(ser1,ser2))
ser_i = pd.Series(np.intersect1d(ser1,ser2))
ser_u[~ser_u.isin(ser_i)]

0    1
1    2
2    3
5    6
6    7
7    8
dtype: int64

In [30]:
dfQ7 = pd.DataFrame({'col1':[1,2,3,4,5],"col2":[3,4,5,6,7]})
dfQ7["col1"][~dfQ7["col1"].isin(dfQ7["col2"])]

0    1
1    2
Name: col1, dtype: int64

## The minimum, 25th percentile, median, 75th, and max of a numeric series

#### `np.random.normal(loc, scale, size)` generate Gaussian distribution numbers
#### `np.percentile(Series, percentage)` can find the percentile for a series of numbers.

In [51]:
serQ8 = pd.Series(np.random.normal(10, 5, 25))
np.percentile(serQ8,[0,25,50,75,100])

array([ 2.79446817,  7.40264008, 10.18758552, 14.01731645, 18.88070387])

## Dataframe solution

In [60]:
dfQ8 = pd.DataFrame({'col1':[1,2,3,4,5,6,7,8,9],'col2':[10,20,30,40,50,60,70,80,90]})
np.percentile(dfQ8["col1"],100)

numpy.float64

## Get frequency counts of unique items of a series
##### `np.take()` take out the elements one by one.
##### if a is a array, `a.take(m,1)` means take the mth number of each row; `a.take(m,0)` takes mth row.

In [52]:
serQ9 = pd.Series(np.take(list('abcdefgh'),np.random.randint(8,size=30)))

In [61]:
np.take(list('Han'),np.array([2,2,1,1,2,1,1,1]))

array(['n', 'n', 'a', 'a', 'n', 'a', 'a', 'a'], dtype='<U1')

## Keep only top n most frequent values as it is and replace everything else as ‘Other’
##### `ser.values `; `ser.index` return values and index of a series
#####  `isin()` can check if the value is in it

In [73]:
n=2
serQ10 = pd.Series(np.random.randint(1,10,[12]))
serQ10[~serQ10.isin(serQ10.value_counts().index[:n])] = "other"
serQ10[:5]

0        1
1    other
2        1
3        2
4        2
dtype: object

## Convert a numpy array to a dataframe of given shape

In [2]:
serQ12 = pd.Series(np.random.randint(1,10,35))
dfQ12 = pd.DataFrame(serQ12.values.reshape(5,7))
# change the column & index names
dfQ12 = pd.DataFrame(serQ12.values.reshape(5,7),index=['a','b','c','d','e'],columns=[10,20,30,40,50,60,70])


## Rename DataFrame & Series
#### `DataFrame.rename(columns = {'origina_column_name' = 'new_column_name'},inplace = True)`
#### Can be used to change name of columns of DataFrame

## Find the positions of numbers that are multiples of 3 from a series
#### `np.argwhere`

In [172]:
serQ13 = pd.Series(np.random.randint(1,10,7))
#####
# argwhere 认真看一下 & 应用到dataframe
#####

## Q14 extract items at given positions from a series
##### list('abcde') 和 ['abcde']的区别 —— 赋值到series后 前一个是每个字母单独分开的，后一个是一起的


In [16]:
# Q14 extract items at given positions from a series
serQ14 = pd.Series(list('abcdefg'))
serQ14_1 = pd.Series(['abcdefg'])
serQ14

0    a
1    b
2    c
3    d
4    e
5    f
6    g
dtype: object

## Q15 stack two series vertically and horizontally 
##### axis = 1 horizontally ; axis = 0 vertically
##### use pd.concat to do this for dataframe? 

In [5]:
ser1Q15 = pd.Series(range(5))
ser2Q15 = pd.Series(list('abcde'))

ser1Q15.append(ser2Q15)
df1Q15 = pd.concat([ser1Q15,ser2Q15],axis=1)

df2Q15 = pd.DataFrame({'col1':[1,2,3],'col2':['a','a','a']})
df3Q15 = pd.DataFrame({'col1':[4,5,6],'col2':['b','b','b']})
pd.concat([df2Q15,df3Q15],axis = 0)

Unnamed: 0,col1,col2
0,1,a
1,2,a
2,3,a
0,4,b
1,5,b
2,6,b


## Q16 get the positions of items of series A in another series B
##### pd.Index(series) 返回series的值
##### get_loc(i) 返回i的下标

In [39]:
# Q16 get the positions of items of series A in another series B
ser1Q16 = pd.Series([1,4,5,6,7,10])
ser2Q16 = pd.Series([4,5,10])


[pd.Index(ser1Q16).get_loc(i) for i in ser2Q16]


SyntaxError: invalid syntax (<ipython-input-39-f2c052b5e6a7>, line 5)

## Q17 compute the mean squared error on a truth and predicted series
##### **乘方

In [3]:
truth = pd.Series(range(10))
pred = pd.Series(range(10)) + np.random.random(10)

np.mean((truth - pred)**2)


0.2518839544044974

## Q18 convert the first character of each element in a series to uppercase
##### map & apply 区别
##### map （其实是python自带的）用于series上，是元素级别的操作
##### applymap 应用在dataframe的每个元素上
##### apply 应用在dataframe 一列上？

In [8]:
serQ18 = pd.Series(['how', 'to', 'kick', 'ass?'])

# solution 1
serQ18.map(lambda x:x.title())

# soluion 2
serQ18.map(lambda x:x[0].upper() + x[1:])

# solution 3
pd.Series([i.title() for i in serQ18])

0     How
1      To
2    Kick
3    Ass?
dtype: object

## Q19 calculate the number of characters in each word in a series


In [10]:
serQ19 = pd.Series(['how','to','my'])

serQ19.map(lambda x:len(x))

0    3
1    2
2    2
dtype: int64

## Q20 compute difference of differences between consequtive numbers of a series
###### diff _ a[n] - a[n-1]


In [13]:
serQ20 = pd.Series([3,5,6,9,10,12,15])

print(serQ20.diff().diff().tolist())

[nan, nan, -1.0, 2.0, -2.0, 1.0, 1.0]


## Q21 convert a series of date-strings to a timeseries


In [18]:
serQ21 = pd.Series(['01 Jan 2010', '02-02-2011', '20120303', '2013/04/04', '2014-05-05', '2015-06-06T12:20'])

pd.to_datetime(serQ21)

0   2010-01-01 00:00:00
1   2011-02-02 00:00:00
2   2012-03-03 00:00:00
3   2013-04-04 00:00:00
4   2014-05-05 00:00:00
5   2015-06-06 12:20:00
dtype: datetime64[ns]

## Q22 get the day of month, week number, day of year and day of week from a series of date strings


In [21]:
# 没做好
serQ22 = pd.Series(['01 Jan 2010', '02-02-2011', '20120303', '2013/04/04', '2014-05-05', '2015-06-06T12:20'])

from dateutil.parser import parse
# parse 
ser_ts = serQ22.map(lambda x: parse(x))


## Q23 convert year-month string to dates corresponding to the 4th day of the month


In [24]:
serQ23 = pd.Series(['Jan 2010', 'Feb 2011', 'Mar 2012'])

serQ23_ts = serQ23.map(lambda x: parse(x))

0   2010-01-02
1   2011-02-02
2   2012-03-02
dtype: datetime64[ns]

## Q24 filter words that contain atleast 2 vowels from a series
##### Counter 返回一个dictionary，每个字母有多少个；
##### get(key,value); 寻找一个字典里key对应的值，value是如果key不存在返回的值

In [33]:
serQ24 = pd.Series(['Apple', 'Orange', 'Plan', 'Python', 'Money'])

from collections import Counter
serQ24[serQ24.map(lambda x: sum(Counter(x.lower()).get(i,0) for i in list('aeiou')) >=2)]

0     Apple
1    Orange
4     Money
dtype: object

## Q25 filter valid emails from a series


In [26]:
emails = pd.Series(['buying books at amazom.com', 'rameses@egypt.com', 'matt@t.co', 'narendra@modi.com'])


NameError: name 'x' is not defined

## 26 get the mean of a series grouped by another series
##### np.random.choice(list,int) 从第一个list里随机选int个数
##### np.linspace(start, stop, num) 创建等差数列
##### tolist()  可将series转换成list


In [41]:
fruit = pd.Series(np.random.choice(['apple', 'banana', 'carrot'], 10))

weights = pd.Series(np.linspace(1, 10, 10))

print(weights.tolist())
print(fruit.tolist())

weights.groupby(fruit).max()

[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]
['apple', 'apple', 'carrot', 'banana', 'carrot', 'banana', 'carrot', 'apple', 'carrot', 'carrot']


Index(['apple', 'banana', 'carrot'], dtype='object')

## Q27 compute the euclidean distance between two series
##### np.linalg.norm()  求范数


In [49]:
p = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
q = pd.Series([10, 9, 8, 7, 6, 5, 4, 3, 2, 1])
dist = sum((p-q)**2)**.5
dist = np.linalg.norm(p-q)

dist

18.16590212458495

# Q28 find all the local maxima (or peaks) in a numeric series
##### sign() 结果1，0，-1分别对应正数、0和负数
##### np.where(condition, x, y) —— 满足输出x，不满足输出y


In [52]:
serQ28 = pd.Series([2, 10, 3, 4, 9, 10, 2, 7, 3])
dd = np.diff(np.sign(np.diff(serQ28)))

peak_locs = np.where(dd == -2)[0] + 1

peak_locs

array([1, 5, 7])

## Q29 replace missing spaces in a string with the least frequent character

In [69]:
my_str = 'dbc deb abed gade'
serQ29 = pd.Series(list('dbc deb abed gade'))
freqQ29 = serQ29.value_counts()
least_freqQ29 = freqQ29.dropna().index[-1]
least_freqQ29

'g'

## 30 create a TimeSeries starting ‘2000-01-01’ and 10 weekends (saturdays) after that having random numbers as values


In [None]:
# 看不懂题
serQ30 = pd.Series(np.random.randint(1,10,10), pd.date_range('2000-01-01', periods=10, freq='W-SAT'))

## 31 fill an intermittent time series so all missing dates show up with values of previous non-missing date

In [None]:
# 31 fill an intermittent time series so all missing dates show up with values of previous non-missing date
# resample 用于重新采样？？？


## Q32 compute the autocorrelations of a numeric series
##### np.random.normal(loc, scale, size) —— loc:central for the distribution; scale:standard deviation


In [5]:
serQ32 = pd.Series(np.arange(20) + np.random.normal(1, 10, 20))


## Q33 How to import only every nth row from a csv file to create a dataframe?
##### 主要用chunksize

In [7]:
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv', chunksize=50)
# solution 1
df2 = pd.DataFrame()
for chunk in df:
    df2 = df2.append(chunk.iloc[0,:])

## Q34 change column values when importing csv to a dataframe