In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# static inline table
df = pd.Series([7,8,9])
df

0    7
1    8
2    9
dtype: int64

In [3]:
# static inline table
myDictionary = {'dte':['2017-01-01','2017-01-02']
                 ,'myVals': [5,6]   
               }
df = pd.DataFrame(myDictionary)

In [4]:
# generate larger table
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=['A', 'B', 'C', 'D']
#                   columns=list('ABCD')
                 )
df

Unnamed: 0,A,B,C,D
2013-01-01,-0.352364,0.774991,-1.012945,-1.046242
2013-01-02,-0.614939,-0.511204,0.381663,-0.9325
2013-01-03,-0.268382,0.285572,0.830174,-0.435975
2013-01-04,-0.43113,-0.777938,0.150168,-0.047943
2013-01-05,-2.704179,0.370476,0.373329,-0.048463
2013-01-06,-0.285326,-0.097215,-0.270931,0.928393


In [11]:
print('{},{}'.format(type(df.index.values),type(df.A)))

<class 'numpy.ndarray'>,<class 'pandas.core.series.Series'>


In [5]:
# stores # rows to numRows
numRows, numColumns = df.shape
df.columns #prints columns
df.dtypes #all types

A    float64
B    float64
C    float64
D    float64
dtype: object

In [6]:
df[1:3] #first row is index 0; prints 2nd and 3rd rows
df['20130104':'20130104'] #prints rows matching index value

Unnamed: 0,A,B,C,D
2013-01-04,-0.729123,-0.555492,-1.414621,1.219469


In [7]:
df.A #print column A
df[['A',"D"]] # print columns A and D
df.loc['20130104':'20130104',['A','B']] #print certain rows and columns
df.at[dates[0],'A'] #A column value for first record in dates, # i don't quite get this one
df.iloc[0,0] # get first value
df.iloc[0:3,2:4] # get certain rows and columns

Unnamed: 0,C,D
2013-01-01,0.060781,0.741392
2013-01-02,-0.43581,1.596806
2013-01-03,-1.056752,1.226914


In [8]:
df.drop(['B', 'C'], axis=1)
# df.drop(columns=['B', 'C']) #doesn't work

Unnamed: 0,A,D
2013-01-01,-2.064379,0.741392
2013-01-02,-0.462518,1.596806
2013-01-03,1.261599,1.226914
2013-01-04,-0.729123,1.219469
2013-01-05,-0.117356,0.374319
2013-01-06,-1.337533,-0.889246


In [9]:
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
df1

Unnamed: 0,A,B,C,D,E
2013-01-01,-2.064379,0.133215,0.060781,0.741392,
2013-01-02,-0.462518,-0.897317,-0.43581,1.596806,
2013-01-03,1.261599,-0.484191,-1.056752,1.226914,
2013-01-04,-0.729123,-0.555492,-1.414621,1.219469,


In [10]:
df1['F']=df1.index + 5 #create new column
df1.fillna(value=5) #sets all na to 5

Unnamed: 0,A,B,C,D,E,F
2013-01-01,-2.064379,0.133215,0.060781,0.741392,5.0,2013-01-06
2013-01-02,-0.462518,-0.897317,-0.43581,1.596806,5.0,2013-01-07
2013-01-03,1.261599,-0.484191,-1.056752,1.226914,5.0,2013-01-08
2013-01-04,-0.729123,-0.555492,-1.414621,1.219469,5.0,2013-01-09


In [11]:
# df1['Grp2'] = pd.Series([45,64,56,67])
# df1['Grp1']= pd.DataFrame ({'Grp1':['Us','Ca']*2})
# df1

In [12]:
df1.describe()

Unnamed: 0,A,B,C,D,E
count,4.0,4.0,4.0,4.0,0.0
mean,-0.498606,-0.450946,-0.711601,1.196145,
std,1.366798,0.429156,0.654727,0.350628,
min,-2.064379,-0.897317,-1.414621,0.741392,
25%,-1.062937,-0.640948,-1.14622,1.099949,
50%,-0.595821,-0.519842,-0.746281,1.223191,
75%,-0.031489,-0.32984,-0.311662,1.319387,
max,1.261599,0.133215,0.060781,1.596806,


In [13]:
df1[df1>1]  #all values > 1
df1[df1.A>0] # records where A > 0
df1

Unnamed: 0,A,B,C,D,E,F
2013-01-01,-2.064379,0.133215,0.060781,0.741392,,2013-01-06
2013-01-02,-0.462518,-0.897317,-0.43581,1.596806,,2013-01-07
2013-01-03,1.261599,-0.484191,-1.056752,1.226914,,2013-01-08
2013-01-04,-0.729123,-0.555492,-1.414621,1.219469,,2013-01-09


In [14]:
df[df.A==df['A'].max()] # find record where A is max
df[['A','C']][df.A==df['A'].max()] # print certain cols where A is max
df[['A','C']][df.A==df.A.max()] # print certain cols where A is max

Unnamed: 0,A,C
2013-01-03,1.261599,-1.056752


# various operations
https://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.html

https://github.com/codebasics/py/tree/master/pandas

In [15]:
df1 = df.copy()
df1.set_index('A',inplace=True) #set index to column A
df1.reset_index() #set index back to autonum

Unnamed: 0,A,B,C,D
0,-2.064379,0.133215,0.060781,0.741392
1,-0.462518,-0.897317,-0.43581,1.596806
2,1.261599,-0.484191,-1.056752,1.226914
3,-0.729123,-0.555492,-1.414621,1.219469
4,-0.117356,-0.883359,-0.218784,0.374319
5,-1.337533,1.551597,-0.658686,-0.889246


In [16]:
df1.head()

Unnamed: 0_level_0,B,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-2.064379,0.133215,0.060781,0.741392
-0.462518,-0.897317,-0.43581,1.596806
1.261599,-0.484191,-1.056752,1.226914
-0.729123,-0.555492,-1.414621,1.219469
-0.117356,-0.883359,-0.218784,0.374319


In [31]:
df1.rename_axis('AAA')  #renames index to AAA
df1.rename(columns = {'B':'BB'} ) # inplace = True, renames column

Unnamed: 0_level_0,B,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-2.064379,0.133215,0.060781,0.741392
-0.462518,-0.897317,-0.43581,1.596806
1.261599,-0.484191,-1.056752,1.226914
-0.729123,-0.555492,-1.414621,1.219469
-0.117356,-0.883359,-0.218784,0.374319


In [None]:
# apply
# unique
# normalize time
# write to hdf
# T for transpose
# astype

https://pandas.pydata.org/pandas-docs/stable/tutorials.html
axis refers to the dimension of the array, in the case of pd.DataFrames axis=0 is the dimension that points downwards and axis=1 the one that points to the right.

In [3]:
# df = pd.read_table('http://datashaping.com/passwords.txt',skiprows=16,names=(['pw']),quoting=3)  # 

df = pd.read_csv('https://data.cityofnewyork.us/resource/fhrw-4uyv.csv',nrows=10)
df

Unnamed: 0,:@computed_region_92fq_4b7q,:@computed_region_efsh_h5xi,:@computed_region_f5dn_yrer,:@computed_region_yeji_bk3q,address_type,agency,agency_name,borough,bridge_highway_direction,bridge_highway_name,...,school_state,school_zip,status,street_name,taxi_company_borough,taxi_pick_up_location,unique_key,vehicle_type,x_coordinate_state_plane,y_coordinate_state_plane
0,7,17621,10,2,ADDRESS,DSNY,Department of Sanitation,BROOKLYN,,,...,Unspecified,Unspecified,Assigned,65 STREET,,,38077974,,984218,167034
1,34,14788,54,3,ADDRESS,DSNY,Department of Sanitation,QUEENS,,,...,Unspecified,Unspecified,Assigned,52 AVENUE,,,38080393,,1012641,206564
2,7,17621,10,2,ADDRESS,DSNY,Department of Sanitation,BROOKLYN,,,...,Unspecified,Unspecified,Assigned,62 STREET,,,38080328,,983567,168562
3,7,17621,10,2,ADDRESS,DSNY,Department of Sanitation,BROOKLYN,,,...,Unspecified,Unspecified,Assigned,10 AVENUE,,,38075653,,982462,169387
4,7,18180,10,2,ADDRESS,DSNY,Department of Sanitation,BROOKLYN,,,...,Unspecified,Unspecified,Assigned,62 STREET,,,38074441,,981419,170267
5,13,10369,4,1,ADDRESS,NYPD,New York City Police Department,Unspecified,,,...,Unspecified,Unspecified,Open,HAMILTON AVENUE,,,38081398,,961847,174188
6,28,14786,40,3,ADDRESS,TLC,Taxi and Limousine Commission,QUEENS,,,...,Unspecified,Unspecified,Open,QUEENS BOULEVARD,,,38078943,,1027259,202193
7,29,11606,24,5,ADDRESS,NYPD,New York City Police Department,BRONX,,,...,Unspecified,Unspecified,Open,GRAND AVENUE,,,38078744,,1011143,253882
8,51,12078,71,4,ADDRESS,DOHMH,Department of Health and Mental Hygiene,MANHATTAN,,,...,Unspecified,Unspecified,Open,EAST 30 STREET,,,38081424,,991172,209291
9,15,13826,32,2,ADDRESS,NYPD,New York City Police Department,BROOKLYN,,,...,Unspecified,Unspecified,Open,EAST 27 STREET,,,38074026,,1000091,154518


In [4]:
%who

df	 np	 pd	 plt	 


In [19]:
def testfn(i):
    return pd.Series([i])

dfList= [testfn(i) for i in range(1, 13)]  #generate a list of Series
df2 = pd.concat(dfList)  
df2

0     1
0     2
0     3
0     4
0     5
0     6
0     7
0     8
0     9
0    10
0    11
0    12
dtype: int64