# Manipulating Tabular Data with Pandas (Ch. 3)

In [2]:
import pandas as pd

## Pandas Series

In [5]:
import pandas as pd

# Build a Series from the integers 1..5; with no explicit index pandas
# labels the entries 0..4.
series_values = list(range(1, 6))
series_example = pd.Series(series_values)
series_example

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [45]:
import string

# First five lowercase letters, one label per Series entry.
first_letters = string.ascii_lowercase[:5]
indices_example = [*first_letters]
print(indices_example)

# Replace the Series' index with the letter labels (assigned in place).
series_example.index = indices_example
series_example

['a', 'b', 'c', 'd', 'e']


a    1
b    2
c    3
d    4
e    5
dtype: int64

In [21]:
# Bracket lookup and .loc are both label-based for this letter index,
# so the two forms must return the same value.
value_via_loc = series_example.loc['d']
assert series_example['d'] == value_via_loc
series_example['d']

4

In [34]:
# Positional slice: the first three entries, whatever their labels are.
print(series_example.iloc[:3])

a    1
b    2
c    3
dtype: int64


In [42]:
# Twelve weekly timestamps starting from the first week boundary on or after 2019-08-03.
pd.date_range(start="20190803", periods=12, freq='W')

DatetimeIndex(['2019-08-04', '2019-08-11', '2019-08-18', '2019-08-25',
               '2019-09-01', '2019-09-08', '2019-09-15', '2019-09-22',
               '2019-09-29', '2019-10-06', '2019-10-13', '2019-10-20'],
              dtype='datetime64[ns]', freq='W-SUN')

In [62]:
import numpy as np

# Seed NumPy's global generator so that "Restart Kernel -> Run All" rebuilds
# the same random frame (and the same downstream random draws) every time.
SEED = 42
np.random.seed(SEED)

# 6x6 frame of random integers in [1, 10), columns labelled 'A'..'F'.
df_example = pd.DataFrame(
    np.random.randint(1, 10, (6, 6)),
    columns=list(string.ascii_uppercase[:6]),
)
df_example

Unnamed: 0,A,B,C,D,E,F
0,9,2,1,6,6,8
1,8,1,3,3,1,3
2,9,5,5,8,9,5
3,9,6,5,8,8,9
4,8,7,1,3,1,5
5,5,5,2,4,7,1


In [63]:
# Per-column summary statistics: count, mean, std, min, quartiles and max.
df_example.describe()

Unnamed: 0,A,B,C,D,E,F
count,6.0,6.0,6.0,6.0,6.0,6.0
mean,8.0,4.333333,2.833333,5.333333,5.333333,5.166667
std,1.549193,2.33809,1.834848,2.33809,3.50238,2.994439
min,5.0,1.0,1.0,3.0,1.0,1.0
25%,8.0,2.75,1.25,3.25,2.25,3.5
50%,8.5,5.0,2.5,5.0,6.5,5.0
75%,9.0,5.75,4.5,7.5,7.75,7.25
max,9.0,7.0,5.0,8.0,9.0,9.0


In [64]:
# Print a structural summary: index range, per-column non-null counts and dtypes, memory usage.
df_example.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 6 columns):
A    6 non-null int64
B    6 non-null int64
C    6 non-null int64
D    6 non-null int64
E    6 non-null int64
F    6 non-null int64
dtypes: int64(6)
memory usage: 416.0 bytes


In [65]:
# Keep every row but only columns 'A' and 'C', selected by label.
df_example.loc[:, ['A', 'C']]

Unnamed: 0,A,C
0,9,1
1,8,3
2,9,5
3,9,5
4,8,1
5,5,2


In [69]:
# Positional row slice: rows at positions 1 and 2 (the end bound is exclusive).
df_example.iloc[1:3]

Unnamed: 0,A,B,C,D,E,F
1,8,1,3,3,1,3
2,9,5,5,8,9,5


In [71]:
# Single cell lookup by row label 1 and column label 'C'.
df_example.loc[1, 'C']

3

In [126]:
# Two independent random boolean masks, one flag per row of df_example.
n_rows = len(df_example)
random_indices_1 = np.random.randint(0, 2, n_rows).astype(bool)
random_indices_2 = np.random.randint(0, 2, n_rows).astype(bool)

In [127]:
# Boolean-mask selection: keep the rows where the first mask is True.
df_example.loc[random_indices_1]

Unnamed: 0,A,B,C,D,E,F
1,8,1,3,3,1,3
3,9,6,5,8,8,9
4,8,7,1,3,1,5


In [128]:
# Boolean-mask selection: keep the rows where the second mask is True.
df_example.loc[random_indices_2]

Unnamed: 0,A,B,C,D,E,F
0,9,2,1,6,6,8
3,9,6,5,8,8,9
4,8,7,1,3,1,5


In [129]:
# Element-wise AND of the two masks: keep only rows selected by both.
rows_in_both = random_indices_1 & random_indices_2
df_example.loc[rows_in_both]

Unnamed: 0,A,B,C,D,E,F
3,9,6,5,8,8,9
4,8,7,1,3,1,5


In [131]:
# Swap rows and columns: the column labels become the index.
df_example.transpose()

Unnamed: 0,0,1,2,3,4,5
A,9,8,9,9,8,5
B,2,1,5,6,7,5
C,1,3,5,5,1,2
D,6,3,8,8,3,4
E,6,1,9,8,1,7
F,8,3,5,9,5,1


In [141]:
# Work on a copy so df_example stays unchanged for the side-by-side print below.
df_modified = df_example.copy()
# Square column 'B' with a vectorized power rather than a per-element Python lambda.
df_modified['B'] = df_example['B'] ** 2
print(df_example, '\n', '*'*20, '\n')
print(df_modified)

   A  B  C  D  E  F
0  9  2  1  6  6  8
1  8  1  3  3  1  3
2  9  5  5  8  9  5
3  9  6  5  8  8  9
4  8  7  1  3  1  5
5  5  5  2  4  7  1 
 ******************** 

   A   B  C  D  E  F
0  9   4  1  6  6  8
1  8   1  3  3  1  3
2  9  25  5  8  9  5
3  9  36  5  8  8  9
4  8  49  1  3  1  5
5  5  25  2  4  7  1


In [165]:
# Adding columns
# Assigning a scalar broadcasts it to every row, so no explicit
# list of repeated zeros is needed.
df_added = df_example.copy()
df_added['G'] = 0
df_added

Unnamed: 0,A,B,C,D,E,F,G
0,9,2,1,6,6,8,0
1,8,1,3,3,1,3,0
2,9,5,5,8,9,5,0
3,9,6,5,8,8,9,0
4,8,7,1,3,1,5,0
5,5,5,2,4,7,1,0


In [197]:
# Deleting columns
# Select every column except 'D' by label; this returns a new frame and
# leaves df_added itself unchanged.
keeping_columns = df_added.columns.tolist()
keeping_columns.remove('D')  # raises ValueError if 'D' is not a column
df_added.loc[:, keeping_columns]

Unnamed: 0,A,B,C,E,F,G
0,9,2,1,6,8,0
1,8,1,3,1,3,0
2,9,5,5,9,5,0
3,9,6,5,8,9,0
4,8,7,1,1,5,0
5,5,5,2,7,1,0


In [207]:
# Return a new frame without column 'A'; df_added itself is not modified.
df_added.drop(columns=['A'])

Unnamed: 0,B,C,D,E,F,G
0,2,1,6,6,8,0
1,1,3,3,1,3,0
2,5,5,8,9,5,0
3,6,5,8,8,9,0
4,7,1,3,1,5,0
5,5,2,4,7,1,0


In [237]:
# Small demo frame: fixed gender labels, a random 0/1 team id and a random
# height in [0, 1). The random draws happen in column order (Team, then Height).
gender_labels = ['Male','Male','Female','Female','Male']
team_ids = np.random.randint(0,2,5)
heights = np.random.random(5)
df = pd.DataFrame({'Gender': gender_labels, 'Team': team_ids, 'Height': heights})
df

Unnamed: 0,Gender,Team,Height
0,Male,0,0.755237
1,Male,0,0.88449
2,Female,1,0.319572
3,Female,0,0.384456
4,Male,0,0.065756


In [238]:
# Contingency table: number of rows for each (Gender, Team) combination.
pd.crosstab(df['Gender'], df['Team'])

Team,0,1
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,1,1
Male,3,0


In [243]:
# Split the frame by team; iterating a GroupBy yields (group key, sub-frame) pairs.
test = df.groupby('Team')

separator = '*'*20
for key, item in test:
    print(key, '\n\n', item, '\n', separator)

0 

    Gender  Team    Height
0    Male     0  0.755237
1    Male     0  0.884490
3  Female     0  0.384456
4    Male     0  0.065756 
 ********************
1 

    Gender  Team    Height
2  Female     1  0.319572 
 ********************
