In [1]:
import numpy as np
import pandas as pd

In [2]:
header = ['chrom', 'pos', 'filter']
data = [4, 12345, 38.4]

In [3]:
vector = np.array(data)
print(vector)

[4.0000e+00 1.2345e+04 3.8400e+01]


In [4]:
data_dict = {'chrom': 4, 'pos': 12345, 'filter': 38.4}

In [5]:
s1 = pd.Series(data=data)

In [6]:
s1

0        4.0
1    12345.0
2       38.4
dtype: float64

In [8]:
s1 > 10

0    False
1     True
2     True
dtype: bool

In [12]:
s1[1]

12345.0

In [9]:
s2 = pd.Series(data=data, index=header)

In [10]:
s2

chrom         4.0
pos       12345.0
filter       38.4
dtype: float64

In [11]:
s2['pos']

12345.0

In [13]:
data = [1, '2s', 34] 

In [14]:
pd.Series(data)

0     1
1    2s
2    34
dtype: object

In [15]:
pd.Series(data_dict)

chrom         4.0
pos       12345.0
filter       38.4
dtype: float64

In [16]:
header1 = ['chrom', 'pos', 'filter']
data1 = [4, 12345, 38.4]
header2 = ['chrom', 'pos', 'filter', 'qual']
data2 = [3, 4899, 234, 89.9]

s1 = pd.Series(data1, header1)
s2 = pd.Series(data2, header2)

In [17]:
s1+s2 

chrom         7.0
filter      272.4
pos       17244.0
qual          NaN
dtype: float64

In [18]:
header1 = ['chrom', 'pos', 'filter']
data1 = [4, 12345, 38.4]
header2 = ['chrom', 'pos', 'filter', 'qual']
data2 = [3, 4899, '234', 89.9]

s1 = pd.Series(data1, header1)
s2 = pd.Series(data2, header2)

In [19]:
s1 + s2

TypeError: unsupported operand type(s) for +: 'float' and 'str'

In [20]:
data1 = [4, 12345, 38.4]
data2 = [3, 4899, 234, 89.9]

s1 = pd.Series(data1)
s2 = pd.Series(data2)

s1 + s2

0        7.0
1    17244.0
2      272.4
3        NaN
dtype: float64

In [21]:
## IMPORTANT - with index labels -- operations are based on label
header1 = ['pos', 'filter', 'chrom']
data1 = [12345, 38.4, 4]
header2 = ['chrom', 'pos', 'filter', 'qual']
data2 = [3, 4899, 234, 89.9]

s1 = pd.Series(data1, header1)
s2 = pd.Series(data2, header2)
s1+s2 

chrom         7.0
filter      272.4
pos       17244.0
qual          NaN
dtype: float64

In [22]:
data1 = [12345, 38.4, 4]
data2 = [3, 4899, 234, 89.9]

s1 = pd.Series(data1)
s2 = pd.Series(data2)
s1 + s2

0    12348.0
1     4937.4
2      238.0
3        NaN
dtype: float64

In [23]:
header = ['exam1', 'exam2', 'exam3']
data = np.random.randint(65, 101, 12).reshape(4,3)
students = ['student1', 'student2', 'student3', 'student4']

In [25]:

df = pd.DataFrame(data=data, columns=header)
print(df)
display(df)

   exam1  exam2  exam3
0     72     65     68
1     71     77     68
2     80     68     79
3     97     70     68


Unnamed: 0,exam1,exam2,exam3
0,72,65,68
1,71,77,68
2,80,68,79
3,97,70,68


In [26]:
df = pd.DataFrame(data=data, index=students, columns=header)
display(df)

Unnamed: 0,exam1,exam2,exam3
student1,72,65,68
student2,71,77,68
student3,80,68,79
student4,97,70,68


In [27]:
df['exam1']

student1    72
student2    71
student3    80
student4    97
Name: exam1, dtype: int32

In [28]:
df.exam1 # not a good way to do this

student1    72
student2    71
student3    80
student4    97
Name: exam1, dtype: int32

In [29]:
select_column = 'exam3'
df[select_column]

student1    68
student2    68
student3    79
student4    68
Name: exam3, dtype: int32

In [30]:
df['average'] = (df['exam1'] + df['exam2'] + df['exam3'])/3 

In [31]:
display(df)

Unnamed: 0,exam1,exam2,exam3,average
student1,72,65,68,68.333333
student2,71,77,68,72.0
student3,80,68,79,75.666667
student4,97,70,68,78.333333


In [32]:
df.drop('average') # does not work because default for drop is to work on row labels

KeyError: "['average'] not found in axis"

In [33]:
df.drop('average', axis=1) # works on column labels 

Unnamed: 0,exam1,exam2,exam3
student1,72,65,68
student2,71,77,68
student3,80,68,79
student4,97,70,68


In [34]:
display(df)

Unnamed: 0,exam1,exam2,exam3,average
student1,72,65,68,68.333333
student2,71,77,68,72.0
student3,80,68,79,75.666667
student4,97,70,68,78.333333


In [35]:
df.drop('average', axis=1, inplace=True)

In [36]:
display(df)

Unnamed: 0,exam1,exam2,exam3
student1,72,65,68
student2,71,77,68
student3,80,68,79
student4,97,70,68


In [37]:
df.drop('student3')

Unnamed: 0,exam1,exam2,exam3
student1,72,65,68
student2,71,77,68
student4,97,70,68


In [39]:
df.shape

(4, 3)

In [41]:
display(df)

Unnamed: 0,exam1,exam2,exam3
student1,72,65,68
student2,71,77,68
student3,80,68,79
student4,97,70,68


In [42]:
header = ['exam1', 'exam2', 'exam3']
data = np.random.randint(65, 101, 12).reshape(4,3)
students = ['student1', 'student2', 'student3', 'student4']
df = pd.DataFrame(data=data, index=students, columns=header)

In [43]:
df['exam1']

student1    83
student2    69
student3    87
student4    94
Name: exam1, dtype: int32

In [44]:
df.loc['student1']

exam1    83
exam2    82
exam3    71
Name: student1, dtype: int32

In [45]:
df.iloc[0] ## remember that column names do not count as rows

exam1    83
exam2    82
exam3    71
Name: student1, dtype: int32

In [46]:
header = ['exam1', 'exam2', 'exam3']
data = np.random.randint(65, 101, 12).reshape(4,3)
students = ['student1', 'student2', 'student3', 'student4']
df = pd.DataFrame(data=data, columns=header)

In [47]:
display(df)

Unnamed: 0,exam1,exam2,exam3
0,71,66,76
1,65,72,78
2,95,71,84
3,83,94,70


In [48]:
df.loc['student1']

KeyError: 'student1'

In [49]:
df.iloc[0]

exam1    71
exam2    66
exam3    76
Name: 0, dtype: int32

In [50]:
header = ['exam1', 'exam2', 'exam3']
data = np.random.randint(65, 101, 12).reshape(4,3)
students = ['student1', 'student2', 'student3', 'student4']
df = pd.DataFrame(data=data, index=students, columns=header)

In [51]:
display(df)

Unnamed: 0,exam1,exam2,exam3
student1,75,86,77
student2,88,80,65
student3,71,73,75
student4,83,65,79


In [52]:
df.loc['student1', 'exam1']

75

In [53]:
df.loc[['student1', 'student3'], ['exam1', 'exam3']]

Unnamed: 0,exam1,exam3
student1,75,77
student3,71,75


In [54]:
header = ['exam1', 'exam2', 'exam3']
data = np.random.randint(65, 101, 12).reshape(4,3)
students = ['student1', 'student2', 'student3', 'student4']
df = pd.DataFrame(data=data, index=students, columns=header)

In [55]:
df>=90

Unnamed: 0,exam1,exam2,exam3
student1,False,False,False
student2,False,False,True
student3,True,False,True
student4,True,False,True


In [56]:
df[df>=90]

Unnamed: 0,exam1,exam2,exam3
student1,,,
student2,,,92.0
student3,91.0,,95.0
student4,94.0,,96.0


In [57]:
df['exam1']>=85

student1    False
student2     True
student3     True
student4     True
Name: exam1, dtype: bool

In [58]:
df[df['exam1']>=85] # gives all columns where exam1 is greater than 85

Unnamed: 0,exam1,exam2,exam3
student2,89,80,92
student3,91,80,95
student4,94,87,96


In [59]:
df[df['exam1']>=85]['exam3'] 

student2    92
student3    95
student4    96
Name: exam3, dtype: int32

In [60]:
df[df['exam1']>=85][['exam2', 'exam3']] 

Unnamed: 0,exam2,exam3
student2,80,92
student3,80,95
student4,87,96


In [61]:
df[(df['exam1']>=85) & (df['exam2']>=85)]

Unnamed: 0,exam1,exam2,exam3
student4,94,87,96


In [62]:
df[(df['exam1']>=85) & (df['exam2']>=85)]['exam3']

student4    96
Name: exam3, dtype: int32

In [63]:
df[(df['exam1']>=85) | (df['exam2']>=85)]

Unnamed: 0,exam1,exam2,exam3
student2,89,80,92
student3,91,80,95
student4,94,87,96


In [64]:
df[(df['exam1']>=85) | (df['exam2']>=85)]['exam3']

student2    92
student3    95
student4    96
Name: exam3, dtype: int32

In [66]:
df[(df['exam1']>=85) | (df['exam2']>=85)]['exam3']

student2    92
student3    95
student4    96
Name: exam3, dtype: int32

In [None]:
# & - and, | - or

In [67]:
header = ['exam1', 'exam2', 'exam3']
data = np.random.randint(65, 101, 12).reshape(4,3)
students = ['student1', 'student2', 'student3', 'student4']
df = pd.DataFrame(data=data, columns=header)

In [68]:
display(df)

Unnamed: 0,exam1,exam2,exam3
0,87,72,71
1,91,84,97
2,89,74,81
3,89,72,89


In [82]:
df['name'] = students

In [83]:
display(df)

Unnamed: 0_level_0,exam1,exam2,exam3,name
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
student1,87,72,71,student1
student2,91,84,97,student2
student3,89,74,81,student3
student4,89,72,89,student4


In [84]:
df.set_index('name', inplace=True)

In [85]:
df.iloc[0]

exam1    87
exam2    72
exam3    71
Name: student1, dtype: int32

In [86]:
df.iloc[0]['exam1']
df.iloc[0][0]
df.iloc[0,0]

87

In [87]:
display(df)

Unnamed: 0_level_0,exam1,exam2,exam3
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
student1,87,72,71
student2,91,84,97
student3,89,74,81
student4,89,72,89


In [88]:
df.loc['student1']

exam1    87
exam2    72
exam3    71
Name: student1, dtype: int32

In [89]:
df.reset_index(inplace=True)

In [91]:
display(df)

Unnamed: 0,name,exam1,exam2,exam3
0,student1,87,72,71
1,student2,91,84,97
2,student3,89,74,81
3,student4,89,72,89


In [92]:
df.loc['student1']

KeyError: 'student1'

In [93]:
students = 'student1 student1 student1 student2 student2 student2 student3 student3 student3'

In [94]:
exams = 'exam1 exam2 exam3'.split()*3

In [95]:
print(exams)

['exam1', 'exam2', 'exam3', 'exam1', 'exam2', 'exam3', 'exam1', 'exam2', 'exam3']


In [96]:
classes = 'class1 class2'

In [97]:
index = list(zip(students.split(), exams))

In [98]:
print(index)

[('student1', 'exam1'), ('student1', 'exam2'), ('student1', 'exam3'), ('student2', 'exam1'), ('student2', 'exam2'), ('student2', 'exam3'), ('student3', 'exam1'), ('student3', 'exam2'), ('student3', 'exam3')]


In [99]:
index = pd.MultiIndex.from_tuples(index)

In [100]:
display(index)

MultiIndex([('student1', 'exam1'),
            ('student1', 'exam2'),
            ('student1', 'exam3'),
            ('student2', 'exam1'),
            ('student2', 'exam2'),
            ('student2', 'exam3'),
            ('student3', 'exam1'),
            ('student3', 'exam2'),
            ('student3', 'exam3')],
           )

In [101]:
df = pd.DataFrame(np.random.randint(65, 101, 3*3*2).reshape(9,2) , index, classes.split())

In [102]:
display(df)

Unnamed: 0,Unnamed: 1,class1,class2
student1,exam1,93,91
student1,exam2,70,74
student1,exam3,72,86
student2,exam1,73,75
student2,exam2,80,96
student2,exam3,67,96
student3,exam1,75,77
student3,exam2,99,69
student3,exam3,75,75


In [103]:
df.loc['student1'].loc['exam1']['class1'] 

93

In [104]:
df.index.names

FrozenList([None, None])

In [105]:
df.index.names = ['Students', 'Exams']

In [106]:
display(df)

Unnamed: 0_level_0,Unnamed: 1_level_0,class1,class2
Students,Exams,Unnamed: 2_level_1,Unnamed: 3_level_1
student1,exam1,93,91
student1,exam2,70,74
student1,exam3,72,86
student2,exam1,73,75
student2,exam2,80,96
student2,exam3,67,96
student3,exam1,75,77
student3,exam2,99,69
student3,exam3,75,75


In [108]:
df[df<70]

Unnamed: 0_level_0,Unnamed: 1_level_0,class1,class2
Students,Exams,Unnamed: 2_level_1,Unnamed: 3_level_1
student1,exam1,,
student1,exam2,,
student1,exam3,,
student2,exam1,,
student2,exam2,,
student2,exam3,67.0,
student3,exam1,,
student3,exam2,,69.0
student3,exam3,,


In [109]:
df.xs('student1')

Unnamed: 0_level_0,class1,class2
Exams,Unnamed: 1_level_1,Unnamed: 2_level_1
exam1,93,91
exam2,70,74
exam3,72,86


In [110]:
df.xs('exam1', level='Exams')

Unnamed: 0_level_0,class1,class2
Students,Unnamed: 1_level_1,Unnamed: 2_level_1
student1,93,91
student2,73,75
student3,75,77


In [113]:
df.xs('exam3', level='Exams')

Unnamed: 0_level_0,class1,class2
Students,Unnamed: 1_level_1,Unnamed: 2_level_1
student1,72,86
student2,67,96
student3,75,75


In [114]:
my_dict = {'student1': [90, 84, np.nan], 'student2': [77, np.nan, np.nan], 'student3': [88, 65, 93]}

In [115]:
df = pd.DataFrame(my_dict)

In [116]:
display(df)

Unnamed: 0,student1,student2,student3
0,90.0,77.0,88
1,84.0,,65
2,,,93


In [117]:
df.dropna()

Unnamed: 0,student1,student2,student3
0,90.0,77.0,88


In [118]:
df.dropna(axis=0)

Unnamed: 0,student1,student2,student3
0,90.0,77.0,88


In [119]:
df.dropna(axis=1)

Unnamed: 0,student3
0,88
1,65
2,93


In [120]:
df.dropna(thresh=2)

Unnamed: 0,student1,student2,student3
0,90.0,77.0,88
1,84.0,,65


In [121]:
df.fillna(value=55)

Unnamed: 0,student1,student2,student3
0,90.0,77.0,88
1,84.0,55.0,65
2,55.0,55.0,93


In [127]:
df.drop(axis=0, labels=[0, 1]) 

Unnamed: 0,student1,student2,student3
2,,,93


In [123]:
df.drop(axis=1, columns=['student1']) 

Unnamed: 0,student2,student3
0,77.0,88
1,,65
2,,93


In [128]:
my_dict = {
    'Exams': 'exam1 exam1 exam1'.split() + 'exam2 exam2 exam2'.split() + 'exam3 exam3 exam3'.split(),
    'Students': 'student1 student2 student3'.split()*3,   
    'Scores': np.random.randint(65,101,9)
}

print(my_dict)

{'Exams': ['exam1', 'exam1', 'exam1', 'exam2', 'exam2', 'exam2', 'exam3', 'exam3', 'exam3'], 'Students': ['student1', 'student2', 'student3', 'student1', 'student2', 'student3', 'student1', 'student2', 'student3'], 'Scores': array([ 75,  87,  69,  70,  91,  99, 100,  76,  93])}


In [129]:
df = pd.DataFrame(my_dict)

In [130]:
display(df)

Unnamed: 0,Exams,Students,Scores
0,exam1,student1,75
1,exam1,student2,87
2,exam1,student3,69
3,exam2,student1,70
4,exam2,student2,91
5,exam2,student3,99
6,exam3,student1,100
7,exam3,student2,76
8,exam3,student3,93


In [131]:
df.groupby('Students').mean()

Unnamed: 0_level_0,Scores
Students,Unnamed: 1_level_1
student1,81.666667
student2,84.666667
student3,87.0


In [132]:
df.groupby('Students').mean().loc['student1']

Scores    81.666667
Name: student1, dtype: float64

In [135]:
df.groupby('Exams').max()['Scores']

Exams
exam1     87
exam2     99
exam3    100
Name: Scores, dtype: int32

In [136]:
df.groupby('Exams').describe() 

Unnamed: 0_level_0,Scores,Scores,Scores,Scores,Scores,Scores,Scores,Scores
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
Exams,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
exam1,3.0,77.0,9.165151,69.0,72.0,75.0,81.0,87.0
exam2,3.0,86.666667,14.977761,70.0,80.5,91.0,95.0,99.0
exam3,3.0,89.666667,12.342339,76.0,84.5,93.0,96.5,100.0


In [137]:
df.groupby('Exams').groupby('Students')

AttributeError: 'DataFrameGroupBy' object has no attribute 'groupby'

In [138]:
df.groupby('Students').describe().transpose() 

Unnamed: 0,Students,student1,student2,student3
Scores,count,3.0,3.0,3.0
Scores,mean,81.666667,84.666667,87.0
Scores,std,16.072751,7.767453,15.874508
Scores,min,70.0,76.0,69.0
Scores,25%,72.5,81.5,81.0
Scores,50%,75.0,87.0,93.0
Scores,75%,87.5,89.0,96.0
Scores,max,100.0,91.0,99.0


In [139]:
departments = { 
    'DepartmentId': [1, 2, 3, 4],
    'DepartmentName': ['IT', 'Physics', 'Arts', 'Math'] 
}

df1 = pd.DataFrame(departments)

In [141]:
display(df1)

Unnamed: 0,DepartmentId,DepartmentName
0,1,IT
1,2,Physics
2,3,Arts
3,4,Math


In [142]:
students = {
    'StudentId': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'StudentName': ['Michael', 'John', 'Jack', 'Sara', 'Sally', 'Jena', 'Nancy', 'Adam', 'Stevens', 'George'],
    'DepartmentId': [1, 1, 1, 2, 2, np.nan, 2, 3, 3, np.nan]
}

df2 = pd.DataFrame(students)


In [143]:
display(df2)

Unnamed: 0,StudentId,StudentName,DepartmentId
0,1,Michael,1.0
1,2,John,1.0
2,3,Jack,1.0
3,4,Sara,2.0
4,5,Sally,2.0
5,6,Jena,
6,7,Nancy,2.0
7,8,Adam,3.0
8,9,Stevens,3.0
9,10,George,


In [144]:
marks = {
    'MarkId': [1, 2, 3, 4, 5, 6, 7, 8],
    'StudentId': [1, 2, 3, 4, 5, 6, 7, 8], 
    'Mark': [18, 20, 16, 19, 14, 20, 20, 20]
}

df3 = pd.DataFrame(marks)

In [145]:
display(df3)

Unnamed: 0,MarkId,StudentId,Mark
0,1,1,18
1,2,2,20
2,3,3,16
3,4,4,19
4,5,5,14
5,6,6,20
6,7,7,20
7,8,8,20


In [148]:
pd.merge(df2, df1, how='inner', on='DepartmentId')

Unnamed: 0,StudentId,StudentName,DepartmentId,DepartmentName
0,1,Michael,1.0,IT
1,2,John,1.0,IT
2,3,Jack,1.0,IT
3,4,Sara,2.0,Physics
4,5,Sally,2.0,Physics
5,7,Nancy,2.0,Physics
6,8,Adam,3.0,Arts
7,9,Stevens,3.0,Arts


In [150]:
pd.merge(df1, df2, how='inner', on='DepartmentId')

Unnamed: 0,DepartmentId,DepartmentName,StudentId,StudentName
0,1,IT,1,Michael
1,1,IT,2,John
2,1,IT,3,Jack
3,2,Physics,4,Sara
4,2,Physics,5,Sally
5,2,Physics,7,Nancy
6,3,Arts,8,Adam
7,3,Arts,9,Stevens


In [153]:
pd.merge(df1, df2, how='outer', on='DepartmentId')

Unnamed: 0,DepartmentId,DepartmentName,StudentId,StudentName
0,1.0,IT,1.0,Michael
1,1.0,IT,2.0,John
2,1.0,IT,3.0,Jack
3,2.0,Physics,4.0,Sara
4,2.0,Physics,5.0,Sally
5,2.0,Physics,7.0,Nancy
6,3.0,Arts,8.0,Adam
7,3.0,Arts,9.0,Stevens
8,4.0,Math,,
9,,,6.0,Jena


In [154]:
pd.merge(df2, df1, how='right', on='DepartmentId') 

Unnamed: 0,StudentId,StudentName,DepartmentId,DepartmentName
0,1.0,Michael,1.0,IT
1,2.0,John,1.0,IT
2,3.0,Jack,1.0,IT
3,4.0,Sara,2.0,Physics
4,5.0,Sally,2.0,Physics
5,7.0,Nancy,2.0,Physics
6,8.0,Adam,3.0,Arts
7,9.0,Stevens,3.0,Arts
8,,,4.0,Math


In [155]:
pd.merge(df3, pd.merge(df2, df1, how='inner', on='DepartmentId'), how='inner', on='StudentId')

Unnamed: 0,MarkId,StudentId,Mark,StudentName,DepartmentId,DepartmentName
0,1,1,18,Michael,1.0,IT
1,2,2,20,John,1.0,IT
2,3,3,16,Jack,1.0,IT
3,4,4,19,Sara,2.0,Physics
4,5,5,14,Sally,2.0,Physics
5,7,7,20,Nancy,2.0,Physics
6,8,8,20,Adam,3.0,Arts


In [156]:
data = pd.merge(df3, pd.merge(df2, df1, how='inner', on='DepartmentId'), how='inner', on='StudentId')

In [157]:
display(data)

Unnamed: 0,MarkId,StudentId,Mark,StudentName,DepartmentId,DepartmentName
0,1,1,18,Michael,1.0,IT
1,2,2,20,John,1.0,IT
2,3,3,16,Jack,1.0,IT
3,4,4,19,Sara,2.0,Physics
4,5,5,14,Sally,2.0,Physics
5,7,7,20,Nancy,2.0,Physics
6,8,8,20,Adam,3.0,Arts


In [158]:
data[['StudentName', 'Mark', 'DepartmentName']] 

Unnamed: 0,StudentName,Mark,DepartmentName
0,Michael,18,IT
1,John,20,IT
2,Jack,16,IT
3,Sara,19,Physics
4,Sally,14,Physics
5,Nancy,20,Physics
6,Adam,20,Arts


In [172]:
d1 = {
    'C0': ['C0R0', 'C0R1', 'C0R2'],
    'C1': ['C1R0', 'C1R1', 'C2R2'],
    'C2': ['C2R0', 'C2R1', 'C2R2'],
}

df1 = pd.DataFrame(d1)

In [173]:
display(df1)

Unnamed: 0,C0,C1,C2
0,C0R0,C1R0,C2R0
1,C0R1,C1R1,C2R1
2,C0R2,C2R2,C2R2


In [174]:
d2 = {
    'C0': ['C0R3', 'C0R4', 'C0R5'],
    'C1': ['C1R3', 'C1R4', 'C1R5'],
    'C2': ['C2R3', 'C2R4', 'C2R5'],
}

df2 = pd.DataFrame(d2)
display(df2)

Unnamed: 0,C0,C1,C2
0,C0R3,C1R3,C2R3
1,C0R4,C1R4,C2R4
2,C0R5,C1R5,C2R5


In [175]:
d3 = {
    'C0': ['C0R6', 'C0R7', 'C0R8'],
    'C1': ['C1R6', 'C1R7', 'C1R8'],
    'C2': ['C2R6', 'C2R7', 'C2R8'],
}

df3 = pd.DataFrame(d3)

df = pd.concat([df1, df2, df3])
display(df)

Unnamed: 0,C0,C1,C2
0,C0R0,C1R0,C2R0
1,C0R1,C1R1,C2R1
2,C0R2,C2R2,C2R2
0,C0R3,C1R3,C2R3
1,C0R4,C1R4,C2R4
2,C0R5,C1R5,C2R5
0,C0R6,C1R6,C2R6
1,C0R7,C1R7,C2R7
2,C0R8,C1R8,C2R8


In [176]:
df.iloc[8]

C0    C0R8
C1    C1R8
C2    C2R8
Name: 2, dtype: object

In [170]:
## Concatenation -- Fix index

d1 = {
    'C0': ['COR0', 'COR1', 'COR2'],
    'C1': ['C1R0', 'C1R1', 'C2R2'],
    'C2': ['C2R0', 'C2R1', 'C2R2'],
}

df1 = pd.DataFrame(d1, index=[1, 2, 3])

d2 = {
    'C0': ['C0R3', 'C0R4', 'C0R5'],
    'C1': ['C1R3', 'C1R4', 'C1R5'],
    'C2': ['C2R3', 'C2R4', 'C2R5'],
}

df2 = pd.DataFrame(d2, index=[4, 5, 6])

d3 = {
    'C0': ['C0R6', 'C0R7', 'C0R8'],
    'C1': ['C1R6', 'C1R7', 'C1R8'],
    'C2': ['C2R6', 'C2R7', 'C2R8'],
}

df3 = pd.DataFrame(d3, index=[7, 8, 9])

pd.concat([df1, df2, df3])

Unnamed: 0,C0,C1,C2
1,COR0,C1R0,C2R0
2,COR1,C1R1,C2R1
3,COR2,C2R2,C2R2
4,C0R3,C1R3,C2R3
5,C0R4,C1R4,C2R4
6,C0R5,C1R5,C2R5
7,C0R6,C1R6,C2R6
8,C0R7,C1R7,C2R7
9,C0R8,C1R8,C2R8


In [177]:
pd.concat([df1, df2, df3], axis=1)

Unnamed: 0,C0,C1,C2,C0.1,C1.1,C2.1,C0.2,C1.2,C2.2
0,C0R0,C1R0,C2R0,C0R3,C1R3,C2R3,C0R6,C1R6,C2R6
1,C0R1,C1R1,C2R1,C0R4,C1R4,C2R4,C0R7,C1R7,C2R7
2,C0R2,C2R2,C2R2,C0R5,C1R5,C2R5,C0R8,C1R8,C2R8


In [178]:
data['DepartmentName']

0         IT
1         IT
2         IT
3    Physics
4    Physics
5    Physics
6       Arts
Name: DepartmentName, dtype: object

In [179]:
data['DepartmentName'].unique() # SELECT DISTINCT DepartmentName From Departments

array(['IT', 'Physics', 'Arts'], dtype=object)

In [180]:
data['DepartmentName'].nunique()  # SELECT DISTINCT count(DepartmentName) From Departments

3

In [181]:
data['DepartmentName'].value_counts() 
# SELECT DepartmentName, count(DepartmentName) From Departments GROUP BY DepartmentName

Physics    3
IT         3
Arts       1
Name: DepartmentName, dtype: int64

In [182]:
data[data['Mark']>17]  

Unnamed: 0,MarkId,StudentId,Mark,StudentName,DepartmentId,DepartmentName
0,1,1,18,Michael,1.0,IT
1,2,2,20,John,1.0,IT
3,4,4,19,Sara,2.0,Physics
5,7,7,20,Nancy,2.0,Physics
6,8,8,20,Adam,3.0,Arts


In [183]:
data['Mark']

0    18
1    20
2    16
3    19
4    14
5    20
6    20
Name: Mark, dtype: int64

In [184]:
data['Mark']*5


0     90
1    100
2     80
3     95
4     70
5    100
6    100
Name: Mark, dtype: int64

In [185]:
def times5(val):
    return val * 5

data['Mark'].apply(times5)

0     90
1    100
2     80
3     95
4     70
5    100
6    100
Name: Mark, dtype: int64

In [186]:
data['Mark'].apply(lambda val: val*5)

0     90
1    100
2     80
3     95
4     70
5    100
6    100
Name: Mark, dtype: int64

In [187]:
data['Mark'].apply(lambda val: f'Grade: {val*5}')

0     Grade: 90
1    Grade: 100
2     Grade: 80
3     Grade: 95
4     Grade: 70
5    Grade: 100
6    Grade: 100
Name: Mark, dtype: object

In [188]:
def upper(string):
    return string.upper()

data['DepartmentName'].apply(upper)

0         IT
1         IT
2         IT
3    PHYSICS
4    PHYSICS
5    PHYSICS
6       ARTS
Name: DepartmentName, dtype: object

In [189]:
data['DepartmentName'].apply(lambda string: string.upper())

0         IT
1         IT
2         IT
3    PHYSICS
4    PHYSICS
5    PHYSICS
6       ARTS
Name: DepartmentName, dtype: object

In [190]:
mapping = {18: 'B', 14: 'C', 19: 'A-', 20: 'A+'}

In [193]:
display(data['Mark'])
data['Mark'].map(mapping)  

0    18
1    20
2    16
3    19
4    14
5    20
6    20
Name: Mark, dtype: int64

0      B
1     A+
2    NaN
3     A-
4      C
5     A+
6     A+
Name: Mark, dtype: object

In [194]:
data.columns

Index(['MarkId', 'StudentId', 'Mark', 'StudentName', 'DepartmentId',
       'DepartmentName'],
      dtype='object')

In [196]:
data.drop(['StudentId', 'MarkId' , 'DepartmentId'], axis=1)[['StudentName', 'Mark', 'DepartmentName']]

Unnamed: 0,StudentName,Mark,DepartmentName
0,Michael,18,IT
1,John,20,IT
2,Jack,16,IT
3,Sara,19,Physics
4,Sally,14,Physics
5,Nancy,20,Physics
6,Adam,20,Arts


In [197]:
display(data)

Unnamed: 0,MarkId,StudentId,Mark,StudentName,DepartmentId,DepartmentName
0,1,1,18,Michael,1.0,IT
1,2,2,20,John,1.0,IT
2,3,3,16,Jack,1.0,IT
3,4,4,19,Sara,2.0,Physics
4,5,5,14,Sally,2.0,Physics
5,7,7,20,Nancy,2.0,Physics
6,8,8,20,Adam,3.0,Arts


In [198]:
data.sort_values('Mark') # SELECT * FROM Data ORDER BY Mark

Unnamed: 0,MarkId,StudentId,Mark,StudentName,DepartmentId,DepartmentName
4,5,5,14,Sally,2.0,Physics
2,3,3,16,Jack,1.0,IT
0,1,1,18,Michael,1.0,IT
3,4,4,19,Sara,2.0,Physics
1,2,2,20,John,1.0,IT
5,7,7,20,Nancy,2.0,Physics
6,8,8,20,Adam,3.0,Arts


In [199]:
data.sort_values('Mark', ascending=False) # SELECT * FROM Data ORDER BY Mark desc

Unnamed: 0,MarkId,StudentId,Mark,StudentName,DepartmentId,DepartmentName
1,2,2,20,John,1.0,IT
5,7,7,20,Nancy,2.0,Physics
6,8,8,20,Adam,3.0,Arts
3,4,4,19,Sara,2.0,Physics
0,1,1,18,Michael,1.0,IT
2,3,3,16,Jack,1.0,IT
4,5,5,14,Sally,2.0,Physics


In [200]:
data = pd.read_csv('students.tsv', sep='\t', names=['lastname', 'firstname', 'username', 'exam1', 'exam2', 'exam3']) 

In [201]:
display(data)

Unnamed: 0,lastname,firstname,username,exam1,exam2,exam3
0,Larson,Melissa,larsonmeli,24,89,31
1,Novak,Melissa,novakmeli,77,3,38
2,Sawyer,Wendy,sawyerwend,81,59,30
3,Schultz,Marcus,schultzmarc,59,29,67
4,Vega,Nicholas,veganich,64,12,95
...,...,...,...,...,...,...
95,Duncan,Andrew,duncanandr,59,87,34
96,Bush,Alexander,bushalex,82,54,25
97,Summers,Michael,summersmich,18,95,63
98,Bowers,Sandra,bowerssand,52,8,50


In [202]:
data.sort_values('exam1', ascending=False) 

Unnamed: 0,lastname,firstname,username,exam1,exam2,exam3
31,Terrell,Robert,terrellrobe,99,93,60
63,Taylor,Diana,taylordian,99,90,6
69,Frank,Jessica,frankjess,95,87,92
37,Rodriguez,Michelle,rodriguezmich,94,24,50
38,Kennedy,Lisa,kennedylisa,93,19,48
...,...,...,...,...,...,...
18,Brooks,Marcus,brooksmarc,6,81,70
50,Perez,Kelli,perezkell,6,22,3
17,Klein,John,kleinjohn,3,43,88
54,Smith,Rebecca,smithrebe,3,75,55


In [203]:
data[['exam1', 'exam2', 'exam3']].mean() 

exam1    50.01
exam2    51.51
exam3    51.14
dtype: float64

In [213]:
data['average']= np.mean(data[['exam1', 'exam2', 'exam3']], axis=1) 

In [214]:
 np.mean(data[['exam1', 'exam2', 'exam3']], axis=0) 

exam1    50.01
exam2    51.51
exam3    51.14
dtype: float64

In [215]:
display(data)

Unnamed: 0,lastname,firstname,username,exam1,exam2,exam3,average
0,Larson,Melissa,larsonmeli,24,89,31,48.000000
1,Novak,Melissa,novakmeli,77,3,38,39.333333
2,Sawyer,Wendy,sawyerwend,81,59,30,56.666667
3,Schultz,Marcus,schultzmarc,59,29,67,51.666667
4,Vega,Nicholas,veganich,64,12,95,57.000000
...,...,...,...,...,...,...,...
95,Duncan,Andrew,duncanandr,59,87,34,60.000000
96,Bush,Alexander,bushalex,82,54,25,53.666667
97,Summers,Michael,summersmich,18,95,63,58.666667
98,Bowers,Sandra,bowerssand,52,8,50,36.666667


In [207]:
data.sort_values('average', ascending=False)

Unnamed: 0,lastname,firstname,username,exam1,exam2,exam3,average
48,Ramsey,John,ramseyjohn,89,95,94,92.666667
69,Frank,Jessica,frankjess,95,87,92,91.333333
31,Terrell,Robert,terrellrobe,99,93,60,84.000000
56,Ashley,Cody,ashleycody,75,69,94,79.333333
39,Henderson,Michelle,hendersonmich,91,60,86,79.000000
...,...,...,...,...,...,...,...
55,Long,Joel,longjoel,20,50,3,24.333333
30,Bailey,Amanda,baileyaman,27,4,38,23.000000
86,Cortez,Patrick,cortezpatr,18,22,2,14.000000
49,Martin,Paul,martinpaul,15,11,12,12.666667


In [217]:
data.sort_values('average', ascending=False).to_csv('output.tsv', sep='\t', index=False, header=False)

In [218]:
data.head()

Unnamed: 0,lastname,firstname,username,exam1,exam2,exam3,average
0,Larson,Melissa,larsonmeli,24,89,31,48.0
1,Novak,Melissa,novakmeli,77,3,38,39.333333
2,Sawyer,Wendy,sawyerwend,81,59,30,56.666667
3,Schultz,Marcus,schultzmarc,59,29,67,51.666667
4,Vega,Nicholas,veganich,64,12,95,57.0


In [219]:
data.head(2)

Unnamed: 0,lastname,firstname,username,exam1,exam2,exam3,average
0,Larson,Melissa,larsonmeli,24,89,31,48.0
1,Novak,Melissa,novakmeli,77,3,38,39.333333


In [221]:
data.tail()

Unnamed: 0,lastname,firstname,username,exam1,exam2,exam3,average
95,Duncan,Andrew,duncanandr,59,87,34,60.0
96,Bush,Alexander,bushalex,82,54,25,53.666667
97,Summers,Michael,summersmich,18,95,63,58.666667
98,Bowers,Sandra,bowerssand,52,8,50,36.666667
99,Fernandez,Jonathan,fernandezjona,74,80,22,58.666667


In [222]:
data.tail(3)

Unnamed: 0,lastname,firstname,username,exam1,exam2,exam3,average
97,Summers,Michael,summersmich,18,95,63,58.666667
98,Bowers,Sandra,bowerssand,52,8,50,36.666667
99,Fernandez,Jonathan,fernandezjona,74,80,22,58.666667


In [223]:
data.shape

(100, 7)

In [224]:
data.iloc[3] 

lastname         Schultz
firstname         Marcus
username     schultzmarc
exam1                 59
exam2                 29
exam3                 67
average          51.6667
Name: 3, dtype: object

In [225]:
data.columns 

Index(['lastname', 'firstname', 'username', 'exam1', 'exam2', 'exam3',
       'average'],
      dtype='object')

In [226]:
data.dtypes 

lastname      object
firstname     object
username      object
exam1          int64
exam2          int64
exam3          int64
average      float64
dtype: object

In [229]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   lastname   100 non-null    object 
 1   firstname  100 non-null    object 
 2   username   100 non-null    object 
 3   exam1      100 non-null    int64  
 4   exam2      100 non-null    int64  
 5   exam3      100 non-null    int64  
 6   average    100 non-null    float64
dtypes: float64(1), int64(3), object(3)
memory usage: 5.6+ KB


In [230]:
data.get_dtype_counts() 

AttributeError: 'DataFrame' object has no attribute 'get_dtype_counts'

In [231]:
data.describe()  

Unnamed: 0,exam1,exam2,exam3,average
count,100.0,100.0,100.0,100.0
mean,50.01,51.51,51.14,50.886667
std,26.417585,28.497561,28.291777,16.222307
min,2.0,1.0,2.0,10.333333
25%,26.75,26.5,27.75,39.583333
50%,52.0,51.5,51.0,50.833333
75%,73.0,77.0,71.5,60.333333
max,99.0,98.0,98.0,92.666667


In [232]:
data[data['exam1'].between(75, 85)] 

Unnamed: 0,lastname,firstname,username,exam1,exam2,exam3,average
1,Novak,Melissa,novakmeli,77,3,38,39.333333
2,Sawyer,Wendy,sawyerwend,81,59,30,56.666667
5,Adams,Brenda,adamsbren,80,49,60,63.0
7,Booth,Bradley,boothbrad,85,8,77,56.666667
9,Munoz,Mary,munozmary,77,41,55,57.666667
11,Paul,Philip,paulphil,79,86,60,75.0
14,Mcclain,Glenn,mcclainglen,85,98,52,78.333333
22,Gibson,Edward,gibsonedwa,84,27,21,44.0
44,Santana,Linda,santanalind,77,58,4,46.333333
45,Patton,Robert,pattonrobe,78,53,20,50.333333


In [235]:
data[data['exam1'].isin([75, 85, 95])]

Unnamed: 0,lastname,firstname,username,exam1,exam2,exam3,average
7,Booth,Bradley,boothbrad,85,8,77,56.666667
14,Mcclain,Glenn,mcclainglen,85,98,52,78.333333
56,Ashley,Cody,ashleycody,75,69,94,79.333333
69,Frank,Jessica,frankjess,95,87,92,91.333333


In [237]:
import numpy as np
import pandas as pd


employees = {
    'EmployeeID': ['EN1-26', 'EN1-33', 'EN1-35', 'EN1-36', 'EN1-38', 'EN1-40'],
    'Last_Name': ["O'Brien", "Guya", "Baranco", "Roslyn", "Schaaf", "Wing"],
    'First_Name': ["Sean", "Amy", "Steven", "Elizabeth", "Carol", "Alexandra" ]
}

df_employees = pd.DataFrame(employees)

projects = {
    'ProjectNum': [
        "30-452-T3", 
        "30-457-T3", 
        "30-482-TC", 
        "31-124-T3", 
        "31-238-TC",
        "31-238-TC2",
        "35-152-TC",
        "36-272-TC"
    ],
    'ProjectTitle': [
        "STAR manual",
        "ISO procedures",
        "Web site",
        "Employee handbook",
        "STAR prototype",
        "New catalog",
        "STAR pricing",
        "Order system"
    ]
}


df_projects = pd.DataFrame(projects)

print(df_projects)

employees_projects = {
    'EmployeeID': [
        "EN1-26", 
        "EN1-26", 
        "EN1-26", 
        "EN1-33", 
        "EN1-33", 
        "EN1-33", 
        "EN1-35", 
        "EN1-35", 
        "EN1-36", 
        "EN1-38", 
        "EN1-40", 
        "EN1-40",
    ],
    'ProjectNum' : [
        "30-452-T3",
        "30-457-T3",
        "31-124-T3",
        "30-328-TC",
        "30-452-T3",
        "32-244-T3",
        "30-452-T3",
        "31-238-TC",
        "35-152-TC",
        "36-272-TC",
        "31-238-TC2",
        "31-241-TC",
    ]


}

df_employees_projects = pd.DataFrame(employees_projects)

data = pd.merge(pd.merge(df_employees_projects, df_employees, how='left', on='EmployeeID'), df_projects,  how='left', on='ProjectNum')
display(data)

   ProjectNum       ProjectTitle
0   30-452-T3        STAR manual
1   30-457-T3     ISO procedures
2   30-482-TC           Web site
3   31-124-T3  Employee handbook
4   31-238-TC     STAR prototype
5  31-238-TC2        New catalog
6   35-152-TC       STAR pricing
7   36-272-TC       Order system


Unnamed: 0,EmployeeID,ProjectNum,Last_Name,First_Name,ProjectTitle
0,EN1-26,30-452-T3,O'Brien,Sean,STAR manual
1,EN1-26,30-457-T3,O'Brien,Sean,ISO procedures
2,EN1-26,31-124-T3,O'Brien,Sean,Employee handbook
3,EN1-33,30-328-TC,Guya,Amy,
4,EN1-33,30-452-T3,Guya,Amy,STAR manual
5,EN1-33,32-244-T3,Guya,Amy,
6,EN1-35,30-452-T3,Baranco,Steven,STAR manual
7,EN1-35,31-238-TC,Baranco,Steven,STAR prototype
8,EN1-36,35-152-TC,Roslyn,Elizabeth,STAR pricing
9,EN1-38,36-272-TC,Schaaf,Carol,Order system


In [238]:
import pandas as pd
data = pd.read_csv("https://raw.githubusercontent.com/fivethirtyeight/data/master/airline-safety/airline-safety.csv")

In [240]:
display(data)
data.set_index('airline', inplace=True)

Unnamed: 0,airline,avail_seat_km_per_week,incidents_85_99,fatal_accidents_85_99,fatalities_85_99,incidents_00_14,fatal_accidents_00_14,fatalities_00_14
0,Aer Lingus,320906734,2,0,0,0,0,0
1,Aeroflot*,1197672318,76,14,128,6,1,88
2,Aerolineas Argentinas,385803648,6,0,0,1,0,0
3,Aeromexico*,596871813,3,1,64,5,0,0
4,Air Canada,1865253802,2,0,0,2,0,0
5,Air France,3004002661,14,4,79,6,2,337
6,Air India*,869253552,2,1,329,4,1,158
7,Air New Zealand*,710174817,3,0,0,5,1,7
8,Alaska Airlines*,965346773,5,0,0,5,1,88
9,Alitalia,698012498,7,2,50,4,0,0


In [241]:
display(data)

Unnamed: 0_level_0,avail_seat_km_per_week,incidents_85_99,fatal_accidents_85_99,fatalities_85_99,incidents_00_14,fatal_accidents_00_14,fatalities_00_14
airline,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Aer Lingus,320906734,2,0,0,0,0,0
Aeroflot*,1197672318,76,14,128,6,1,88
Aerolineas Argentinas,385803648,6,0,0,1,0,0
Aeromexico*,596871813,3,1,64,5,0,0
Air Canada,1865253802,2,0,0,2,0,0
Air France,3004002661,14,4,79,6,2,337
Air India*,869253552,2,1,329,4,1,158
Air New Zealand*,710174817,3,0,0,5,1,7
Alaska Airlines*,965346773,5,0,0,5,1,88
Alitalia,698012498,7,2,50,4,0,0


In [245]:
data.loc['Xiamen Airlines']

avail_seat_km_per_week    430462962
incidents_85_99                   9
fatal_accidents_85_99             1
fatalities_85_99                 82
incidents_00_14                   2
fatal_accidents_00_14             0
fatalities_00_14                  0
Name: Xiamen Airlines, dtype: int64

In [246]:
# 400, 380 normal, 20 diseased
380/400*100

95.0