In [3]:
import pandas as pd
import numpy as np



In [4]:
# Create a small DataFrame whose columns each have a different dtype.
# Scalar values (B, C, D, F) are broadcast to match the length of the
# array-like columns (A, E, G), which all have 3 rows.

dft = pd.DataFrame({
    'A': np.random.rand(3),
    'B': 1,
    'C': 'foo',
    'D': pd.Timestamp('20010102'),
    'E': pd.Series([1.0] * 3).astype('float32'),
    'F': False,
    'G': pd.Series([1] * 3, dtype='int8'),
})

dft

Unnamed: 0,A,B,C,D,E,F,G
0,0.714093,1,foo,2001-01-02,1.0,False,1
1,0.663399,1,foo,2001-01-02,1.0,False,1
2,0.595014,1,foo,2001-01-02,1.0,False,1


In [5]:
# The `dtypes` attribute gives a quick per-column summary: it
# returns a Series indexed by column name whose values are the
# dtype of each column.

dft.dtypes

A           float64
B             int64
C            object
D    datetime64[ns]
E           float32
F              bool
G              int8
dtype: object

In [6]:
# If a pandas object contains data of multiple dtypes IN A
# SINGLE COLUMN, the dtype of the column will be chosen
# to accommodate all of the data types (object is the
# most general).
# Here the ints are upcast to floats because the last
# element (6.) is a float.

pd.Series([1, 2, 3, 4, 5, 6.])

0    1.0
1    2.0
2    3.0
3    4.0
4    5.0
5    6.0
dtype: float64

In [7]:
# Mixing string data in with numbers forces the ``object`` dtype,
# because no numeric dtype can hold both the numbers and 'foo'.

pd.Series([1, 2, 3, 6., 'foo'])

0      1
1      2
2      3
3      6
4    foo
dtype: object

In [8]:
# Count how many columns of each dtype the DataFrame has.
# DataFrame.get_dtype_counts() was deprecated in pandas 0.25 and
# removed in 1.0; the supported replacement is to call
# value_counts() on the Series returned by `dtypes`.

dft.dtypes.value_counts()

bool              1
datetime64[ns]    1
float32           1
float64           1
int64             1
int8              1
object            1
dtype: int64

In [10]:
# Build a 5x4 DataFrame of draws from the standard normal
# distribution, with the columns labelled 'a' through 'd'.

df = pd.DataFrame(data=np.random.randn(5, 4), columns=list('abcd'))
df

Unnamed: 0,a,b,c,d
0,1.095233,-0.190617,1.645504,0.285175
1,0.800069,1.410802,1.570549,-1.730656
2,-0.98156,0.275497,1.822243,0.092416
3,0.983433,-1.06284,-1.346694,-2.363093
4,1.664715,-1.301966,1.218833,-0.724404


In [11]:
# Use df.apply to take the square root of every value, one
# column at a time. np.sqrt has no real result for negative
# inputs, so those cells come back as NaN ("not a number").

df.apply(np.sqrt)

Unnamed: 0,a,b,c,d
0,1.046534,,1.282772,0.534018
1,0.894466,1.187772,1.253215,
2,,0.524879,1.349905,0.304
3,0.991682,,,
4,1.290238,,1.104008,


In [50]:
# Find the mean of each column: axis=0 passes every column to
# np.mean in turn, giving one value per column.
# NOTE(review): the saved output of this cell comes from a different
# execution (its execution count is out of sequence) and does not
# match the `df` displayed above -- re-run the notebook top to bottom
# to refresh it.

df.apply(np.mean, axis=0)

a   -0.676355
b    0.601250
c    0.606017
d   -0.319823
dtype: float64

In [12]:
# Find the mean of each row: axis=1 passes every row to
# np.mean in turn, giving one value per row.

df.apply(np.mean, axis=1)

0    0.708824
1    0.512691
2    0.302149
3   -0.947299
4    0.214294
dtype: float64

In [13]:
# Draw 50 random integers from 0 to 6 inclusive. The upper
# bound passed to randint (7) is exclusive, so 7 itself never
# appears.

data = np.random.randint(low=0, high=7, size=50)
data

array([1, 0, 0, 3, 5, 0, 1, 4, 1, 4, 1, 2, 3, 5, 2, 6, 4, 6, 1, 3, 3, 3, 2,
       4, 3, 4, 1, 3, 1, 2, 2, 1, 1, 0, 4, 3, 5, 0, 6, 5, 1, 3, 1, 1, 0, 0,
       2, 5, 2, 0])

In [14]:
# Wrap the NumPy array in a pandas Series so that Series
# methods can be used on it.

s = pd.Series(data)

In [15]:
# How many times does each number occur in the series?
# value_counts() tallies the occurrences of each distinct value,
# most frequent first. It is called as a Series method here because
# the top-level pd.value_counts() function is deprecated in recent
# pandas releases.

s.value_counts()

1    12
3     9
0     8
2     7
4     6
5     5
6     3
dtype: int64

In [108]:
# Read the "rock" CSV file into a DataFrame and display it.
# NOTE(review): this is an absolute path on one specific machine, so
# the cell will fail anywhere else. Consider a path relative to the
# notebook or a configurable data directory.
rock = pd.read_csv('C:/Users/DK/DSI-course-materials/curriculum/04-lessons/week-02/2.3-lesson/code/rock.csv')
rock





Unnamed: 0,Song Clean,ARTIST CLEAN,Release Year,COMBINED,First?,Year?,PlayCount,F*G
0,Caught Up in You,.38 Special,1982,Caught Up in You by .38 Special,1,1,82,82
1,Fantasy Girl,.38 Special,,Fantasy Girl by .38 Special,1,0,3,0
2,Hold On Loosely,.38 Special,1981,Hold On Loosely by .38 Special,1,1,85,85
3,Rockin' Into the Night,.38 Special,1980,Rockin' Into the Night by .38 Special,1,1,18,18
4,Art For Arts Sake,10cc,1975,Art For Arts Sake by 10cc,1,1,1,1
5,Kryptonite,3 Doors Down,2000,Kryptonite by 3 Doors Down,1,1,13,13
6,Loser,3 Doors Down,2000,Loser by 3 Doors Down,1,1,1,1
7,When I'm Gone,3 Doors Down,2002,When I'm Gone by 3 Doors Down,1,1,6,6
8,What's Up?,4 Non Blondes,1992,What's Up? by 4 Non Blondes,1,1,3,3
9,Take On Me,a-ha,1985,Take On Me by a-ha,1,1,1,1


In [109]:
# Calculate descriptive statistics (count, mean, std, min, quartiles,
# max) for the numeric columns of the "rock" DataFrame. describe() is
# called as a method on the frame rather than through the class.
rock.describe()

Unnamed: 0,First?,Year?,PlayCount,F*G
count,2230.0,2230.0,2230.0,2230.0
mean,1.0,0.741256,16.872646,15.04843
std,0.0,0.438043,25.302972,25.288366
min,1.0,0.0,0.0,0.0
25%,1.0,0.0,1.0,0.0
50%,1.0,1.0,4.0,3.0
75%,1.0,1.0,21.0,18.0
max,1.0,1.0,142.0,142.0


In [110]:
# Drop every row that has at least one missing value. dropna()
# returns a new DataFrame, so the original `rock` is left untouched.
rock2 = rock.dropna()
rock2

Unnamed: 0,Song Clean,ARTIST CLEAN,Release Year,COMBINED,First?,Year?,PlayCount,F*G
0,Caught Up in You,.38 Special,1982,Caught Up in You by .38 Special,1,1,82,82
2,Hold On Loosely,.38 Special,1981,Hold On Loosely by .38 Special,1,1,85,85
3,Rockin' Into the Night,.38 Special,1980,Rockin' Into the Night by .38 Special,1,1,18,18
4,Art For Arts Sake,10cc,1975,Art For Arts Sake by 10cc,1,1,1,1
5,Kryptonite,3 Doors Down,2000,Kryptonite by 3 Doors Down,1,1,13,13
6,Loser,3 Doors Down,2000,Loser by 3 Doors Down,1,1,1,1
7,When I'm Gone,3 Doors Down,2002,When I'm Gone by 3 Doors Down,1,1,6,6
8,What's Up?,4 Non Blondes,1992,What's Up? by 4 Non Blondes,1,1,3,3
9,Take On Me,a-ha,1985,Take On Me by a-ha,1,1,1,1
11,Back In Black,AC/DC,1980,Back In Black by AC/DC,1,1,97,97


In [111]:
# Recalculate the descriptive statistics now that the rows with
# missing values have been removed.
rock2.describe()

Unnamed: 0,First?,Year?,PlayCount,F*G
count,1653.0,1653.0,1653.0,1653.0
mean,1.0,1.0,20.30127,20.30127
std,0.0,0.0,27.498338,27.498338
min,1.0,1.0,0.0,0.0
25%,1.0,1.0,2.0,2.0
50%,1.0,1.0,7.0,7.0
75%,1.0,1.0,28.0,28.0
max,1.0,1.0,142.0,142.0
