In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
df = pd.read_table('british-english', keep_default_na=False, header=None)
df.columns = ['words']


In [4]:
#acceccing first entry:
df['words'][0]
#or
df.iloc[0, 0]
#or
df.loc[0, 'words']

#accessing all words starting by a:
mask = df['words'].str.startswith('A')
#or
mask = df['words'].str[0] == 'A'

df[mask]

#accessing all words starting with upper- or lowercase a:

#converting everything to lowercase:
mask = df['words'].str.lower().str[0] == 'a'
#converting everything into uppercase:
mask = df['words'].str.upper().str[0] == 'A'
#or checking for two diff. things:
mask1 = df['words'].str[0] == 'A'
mask2 = df['words'].str[0] == 'a'

#mask1 or mask2
mask = mask1 | mask2 
#mask1 and mask2
mask = mask1 & mask2

In [5]:
#adding columns

np_array = np.arange(6).reshape((3,2))
df = pd.DataFrame(np_array, index=['a', 'b', 'c'], columns=['col1', 'col2'])
print(df)

# Copying col1, but skipping first row. Pandas will get around the missing value by inserting a Nan value
df.loc[:,'col3'] = df.loc['b':,'col1']
print(df)       

   col1  col2
a     0     1
b     2     3
c     4     5
   col1  col2  col3
a     0     1   NaN
b     2     3   2.0
c     4     5   4.0


In [6]:
#adding rows by assigning to a relevant index

np_array = np.arange(6).reshape((3,2))
df = pd.DataFrame(np_array,
                  index=['a', 'b', 'c'],
                  columns=['col1', 'col2'])
# Adding row labeled 'd' - copy of `a`
df.loc['d',:] = df.loc['a',:]
print(df)

   col1  col2
a   0.0   1.0
b   2.0   3.0
c   4.0   5.0
d   0.0   1.0


In [7]:
#Aligning dataframes

np_array1 = np.arange(9).reshape((3,3))
df1 = pd.DataFrame(np_array1)
df2 = pd.DataFrame(np_array1[:2,:2])

# In pandas operations can be done between two datasets of unequal size

df1 + df2

#It will just return NaN for the missing values



Unnamed: 0,0,1,2
0,0.0,2.0,
1,6.0,8.0,
2,,,


In [8]:
#Once inside of a pandas dataframe, we can do a lot of operations on the data

#we have .mean(), .std(), .var(), .min(), .max(), .sumsum(), .sumprod(), etc.
#we specify the axis along which we want to perform the operation, 0 is the row axis
#and 1 is the column axis


#there is a default skipna built in to the operations, which you can access and equal to False

(df1+df2).sum(axis=0, min_count=1) #min_conut=1 means minimum one number in the column must be a number

(df1+df2).isna() #returns a boolean dataframe with True for NaN values

#we can also use the .fillna() method to fill in NaN values with a specific value
    #we can also .dropna() to remove NaN values
    #you can interpolate the values with .interpolate(), take the interpolation between the two neighbours



Unnamed: 0,0,1,2
0,False,False,True
1,False,False,True
2,True,True,True


In [9]:
#sorting dataframes

df.sort_index(axis=0, ascending=False) #sorts the rows in descending order

df.sort_values(by='col1') #sorts the rows by the values in col1
df.sort_values(by=['col1', 'col2']) #sorts the rows by the values in col1 and then col2

Unnamed: 0,col1,col2
a,0.0,1.0
d,0.0,1.0
b,2.0,3.0
c,4.0,5.0


In [10]:
#merging dataframes

df1 = pd.DataFrame({'key': ['a', 'b', 'c', 'd'],
                    'value': np.random.randn(4)})
df2 = pd.DataFrame({'key': ['b', 'd', 'e', 'f'],
                    'value': np.random.randn(4)})
merged = pd.merge(df1, df2, on='key') #merges the two dataframes on the key column

print(merged) #see it gives only the overlap

union = pd.merge(df1, df2, on='key', how='outer') #gives the union of the two dataframes

print(union) #see it gives all the values, with nan for the missing values

  key   value_x   value_y
0   b -0.864750  0.526397
1   d -0.405102 -0.660585
  key   value_x   value_y
0   a  0.811575       NaN
1   b -0.864750  0.526397
2   c -1.357731       NaN
3   d -0.405102 -0.660585
4   e       NaN  0.831087
5   f       NaN -2.019015


In [11]:
df_british = pd.read_table('british-english', keep_default_na=False, header=None)
df_british.columns = ['words_english']

df_american = pd.read_table('american-english', keep_default_na=False, header=None)
df_american.columns = ['words_american']

df_merged = pd.merge(df_british, df_american, left_on='words_english', right_on='words_american', how='left', indicator=True)

merge_mask = df_merged['_merge'] == 'left_only'

print(df_merged['words_english'][merge_mask])


2                AMD
3              AMD's
7           Aachen's
11           Aaron's
13           Abbas's
             ...    
101057     workflows
101672        zapper
101673      zapper's
101674       zappers
101808    Ångström's
Name: words_english, Length: 5357, dtype: object


In [12]:
#grouping dataframes

df = pd.DataFrame({'name' : pd.Categorical(['Bob', 'Alice', 'Anna']),
                   'gender' : pd.Categorical(['male', 'female', 'female']),
                   'height': [170, 180, 165]})
gender_grps = df.groupby('gender')
print(gender_grps) #mysterious

#you can also group dynamically
df.groupby(df['name'].str.len()) #groups by the length of the name

#we can also iterate over elements in a group
for name, grp in gender_grps:
    print(name)
    print(grp)

#we can also calculate summary statistics for each group
gender_grps['height'].mean() #gives the mean height
gender_grps.agg({'height':np.mean}) # Returns a DataFrame

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x1224d8610>
female
    name  gender  height
1  Alice  female     180
2   Anna  female     165
male
  name gender  height
0  Bob   male     170


  gender_grps = df.groupby('gender')
  gender_grps.agg({'height':np.mean}) # Returns a DataFrame


Unnamed: 0_level_0,height
gender,Unnamed: 1_level_1
female,172.5
male,170.0


In [13]:
df_b = pd.read_table('british-english', keep_default_na=False, header=None)

df_b.columns = ['words']

alphabetical_groups = df_b.groupby(df_b['words'].str[0]) #gives the number of words starting with each letter



In [21]:
lucky_letter = 'c'

for name, grp in alphabetical_groups:
    if name == lucky_letter:
        lucky_group = grp
    else: 
        continue

print(lucky_group) #non-indexable unfortunately :( 

           words
28618          c
28619        cab
28620      cab's
28621      cabal
28622    cabal's
...          ...
36752     czar's
36753    czarina
36754  czarina's
36755   czarinas
36756      czars

[8139 rows x 1 columns]
