# Importing NumPy, Pandas

In [2]:
import numpy as np
import pandas as pd

# Simple Usage of Pandas, Framework

In [3]:
# making a dictionary here 
dict1 = {
    "name" : ["atif", "arjit", "vitalsigns", "eminem", "akon"],
    "narks" : [34,56,78,89,90],
    "city": ["lahore", "delhi", "faisalabad", "newyork", "brazil"]
}

In [4]:
# made the dataframe - for faster indexing it made it like an excel sheet
# so you can analyze the data more correctly - big data
df1 = pd.DataFrame(dict1)

In [5]:
df1

Unnamed: 0,name,narks,city
0,atif,34,lahore
1,arjit,56,delhi
2,vitalsigns,78,faisalabad
3,eminem,89,newyork
4,akon,90,brazil


In [6]:
# for exporting this dataframe into excel/csv
df1.to_csv('tempdata.csv')

In [7]:
# if you don't want to include the index
df1.to_csv("tempdata_false_again.csv", index=False)

In [8]:
# starting n rows
df1.head(2)

Unnamed: 0,name,narks,city
0,atif,34,lahore
1,arjit,56,delhi


In [9]:
# end n rows
df1.tail(2)

Unnamed: 0,name,narks,city
3,eminem,89,newyork
4,akon,90,brazil


In [10]:
# it runs only on the numerical values, will do the statistical analysis
df1.describe()

Unnamed: 0,narks
count,5.0
mean,69.4
std,24.058263
min,34.0
25%,56.0
50%,78.0
75%,89.0
max,90.0


In [11]:
# for reading any csv
red = pd.read_csv("tempdata_false_again.csv")

In [12]:
red

Unnamed: 0,name,narks,city
0,atif,34,lahore
1,arjit,56,delhi
2,vitalsigns,78,faisalabad
3,eminem,89,newyork
4,akon,90,brazil


In [13]:
# for getting the specific column
red['name']

0          atif
1         arjit
2    vitalsigns
3        eminem
4          akon
Name: name, dtype: object

In [14]:
# for gettign the column with specific index
red['narks'][0]

34

In [15]:
# for changing the index
# red.index = ['first', 'second', 'third', 'fourth', 'fifth']

In [17]:
# Pandas has basically 2 types of Data Structures:
# Series: 1D-array for holding similar types of data
# DataFrame: 1D-array for holding the different types of data

type(red['narks'])
type(red)

pandas.core.frame.DataFrame

# Making Difference between Series and Dataframe

### Series

In [18]:
ser = pd.Series(np.random.rand(34))

In [19]:
ser

0     0.769238
1     0.536542
2     0.251899
3     0.157970
4     0.962338
5     0.447774
6     0.647638
7     0.397903
8     0.894408
9     0.168324
10    0.445117
11    0.619570
12    0.937629
13    0.587533
14    0.470318
15    0.177032
16    0.866220
17    0.120702
18    0.311833
19    0.689047
20    0.020600
21    0.345674
22    0.541581
23    0.003800
24    0.428381
25    0.506589
26    0.674284
27    0.233817
28    0.115579
29    0.243329
30    0.134840
31    0.191282
32    0.864785
33    0.343178
dtype: float64

In [20]:
type(ser)

pandas.core.series.Series

### Pandas

In [21]:
# here i am making 334 rows with 5 columns, and filling it using arange(334) : 0 to 334 whixh are the index, rand makes sure to fill in matrix with [0,1)
newdf = pd.DataFrame(np.random.rand(334,5), index = np.arange(334))

In [23]:
newdf
newdf.head()
newdf.tail()

Unnamed: 0,0,1,2,3,4
0,0.796144,0.404274,0.764338,0.984271,0.314296
1,0.760688,0.367562,0.843346,0.290815,0.000209
2,0.542716,0.060726,0.392223,0.000017,0.178171
3,0.503266,0.853110,0.526866,0.552691,0.932347
4,0.147099,0.368250,0.101266,0.001634,0.101801
...,...,...,...,...,...
329,0.021111,0.320712,0.472087,0.393549,0.326203
330,0.080108,0.976995,0.252372,0.022866,0.277557
331,0.080421,0.428622,0.027385,0.897962,0.273310
332,0.573527,0.800608,0.741691,0.680488,0.885140


In [24]:
newdf.describe()

Unnamed: 0,0,1,2,3,4
count,334.0,334.0,334.0,334.0,334.0
mean,0.502625,0.489859,0.487105,0.480514,0.518333
std,0.296015,0.28852,0.280101,0.292127,0.283835
min,0.007377,0.004599,0.006634,1.7e-05,0.000209
25%,0.247522,0.244424,0.250102,0.224912,0.289721
50%,0.503487,0.4927,0.485771,0.469804,0.504664
75%,0.760919,0.725696,0.733702,0.715652,0.781068
max,0.999173,0.997121,0.998797,0.998454,0.987185


In [27]:
newdf.dtypes

0    float64
1    float64
2    float64
3    float64
4    float64
dtype: object

### Now in df difference between simple indexing, and loc

In [29]:
# if i put 'abubakar'on the [0][0] of the newdf
# it covert the first column as in whole due to one value to object
newdf[0][0] = "abubakar"

In [40]:
newdf.dtypes

0     object
1    float64
2    float64
3    float64
4    float64
dtype: object

In [32]:
newdf.head(3)

Unnamed: 0,0,1,2,3,4
0,abubakar,0.404274,0.764338,0.984271,0.314296
1,0.760688,0.367562,0.843346,0.290815,0.000209
2,0.542716,0.060726,0.392223,1.7e-05,0.178171


In [38]:
# convert the dataframe to numpy array
newdf.to_numpy()

array([[0.69, 0.4042735870226455, 0.7643376960692345, 0.9842708431023817,
        0.3142957944682725],
       [0.7606879360670162, 0.36756180110770775, 0.8433455640220189,
        0.2908152432392793, 0.00020900462260031727],
       [0.5427160185787802, 0.060726443238090067, 0.39222325348003695,
        1.72774242628293e-05, 0.17817070327615003],
       ...,
       [0.08042123369812726, 0.42862184946443205, 0.027385310799475704,
        0.8979615578972627, 0.27331003189785796],
       [0.5735266807726228, 0.800607934755436, 0.7416914268517295,
        0.6804883444440951, 0.8851397208611306],
       [0.6560508515152886, 0.49248746962451173, 0.06673135522773377,
        0.18185763755500728, 0.5625616648135895]], dtype=object)

In [34]:
type(newdf)

pandas.core.frame.DataFrame

In [35]:
newdf[0][0] = 0.69

In [36]:
newdf

Unnamed: 0,0,1,2,3,4
0,0.69,0.404274,0.764338,0.984271,0.314296
1,0.760688,0.367562,0.843346,0.290815,0.000209
2,0.542716,0.060726,0.392223,0.000017,0.178171
3,0.503266,0.853110,0.526866,0.552691,0.932347
4,0.147099,0.368250,0.101266,0.001634,0.101801
...,...,...,...,...,...
329,0.021111,0.320712,0.472087,0.393549,0.326203
330,0.080108,0.976995,0.252372,0.022866,0.277557
331,0.080421,0.428622,0.027385,0.897962,0.273310
332,0.573527,0.800608,0.741691,0.680488,0.885140


In [39]:
type(newdf)

array([[0.69, 0.4042735870226455, 0.7643376960692345, 0.9842708431023817,
        0.3142957944682725],
       [0.7606879360670162, 0.36756180110770775, 0.8433455640220189,
        0.2908152432392793, 0.00020900462260031727],
       [0.5427160185787802, 0.060726443238090067, 0.39222325348003695,
        1.72774242628293e-05, 0.17817070327615003],
       ...,
       [0.08042123369812726, 0.42862184946443205, 0.027385310799475704,
        0.8979615578972627, 0.27331003189785796],
       [0.5735266807726228, 0.800607934755436, 0.7416914268517295,
        0.6804883444440951, 0.8851397208611306],
       [0.6560508515152886, 0.49248746962451173, 0.06673135522773377,
        0.18185763755500728, 0.5625616648135895]], dtype=object)

In [41]:
newdf.tail()

Unnamed: 0,0,1,2,3,4
329,0.021111,0.320712,0.472087,0.393549,0.326203
330,0.080108,0.976995,0.252372,0.022866,0.277557
331,0.080421,0.428622,0.027385,0.897962,0.27331
332,0.573527,0.800608,0.741691,0.680488,0.88514
333,0.656051,0.492487,0.066731,0.181858,0.562562


In [42]:
newdf.sort_index(axis=0, ascending=False)

Unnamed: 0,0,1,2,3,4
333,0.656051,0.492487,0.066731,0.181858,0.562562
332,0.573527,0.800608,0.741691,0.680488,0.885140
331,0.080421,0.428622,0.027385,0.897962,0.273310
330,0.080108,0.976995,0.252372,0.022866,0.277557
329,0.021111,0.320712,0.472087,0.393549,0.326203
...,...,...,...,...,...
4,0.147099,0.368250,0.101266,0.001634,0.101801
3,0.503266,0.853110,0.526866,0.552691,0.932347
2,0.542716,0.060726,0.392223,0.000017,0.178171
1,0.760688,0.367562,0.843346,0.290815,0.000209


In [44]:
newdf.head()

Unnamed: 0,0,1,2,3,4
0,0.69,0.404274,0.764338,0.984271,0.314296
1,0.760688,0.367562,0.843346,0.290815,0.000209
2,0.542716,0.060726,0.392223,1.7e-05,0.178171
3,0.503266,0.85311,0.526866,0.552691,0.932347
4,0.147099,0.36825,0.101266,0.001634,0.101801


In [45]:
newdf.sort_index(axis=1, ascending=False)

Unnamed: 0,4,3,2,1,0
0,0.314296,0.984271,0.764338,0.404274,0.69
1,0.000209,0.290815,0.843346,0.367562,0.760688
2,0.178171,0.000017,0.392223,0.060726,0.542716
3,0.932347,0.552691,0.526866,0.853110,0.503266
4,0.101801,0.001634,0.101266,0.368250,0.147099
...,...,...,...,...,...
329,0.326203,0.393549,0.472087,0.320712,0.021111
330,0.277557,0.022866,0.252372,0.976995,0.080108
331,0.273310,0.897962,0.027385,0.428622,0.080421
332,0.885140,0.680488,0.741691,0.800608,0.573527


In [46]:
newdf.loc[0,0] = 0.65432

In [47]:
newdf

Unnamed: 0,0,1,2,3,4
0,0.65432,0.404274,0.764338,0.984271,0.314296
1,0.760688,0.367562,0.843346,0.290815,0.000209
2,0.542716,0.060726,0.392223,0.000017,0.178171
3,0.503266,0.853110,0.526866,0.552691,0.932347
4,0.147099,0.368250,0.101266,0.001634,0.101801
...,...,...,...,...,...
329,0.021111,0.320712,0.472087,0.393549,0.326203
330,0.080108,0.976995,0.252372,0.022866,0.277557
331,0.080421,0.428622,0.027385,0.897962,0.273310
332,0.573527,0.800608,0.741691,0.680488,0.885140


In [None]:
# for dropping the column or row from the dataframe 
# axis1 (horizontal)
# newdf.drop(0,axis=1)
# axis2 (vertical)
# newdf.drop(0,axis=0)

In [48]:
# returning the newdataframe as copy of newdf
newdf.loc[[1,2], [3,4]]

Unnamed: 0,3,4
1,0.290815,0.000209
2,1.7e-05,0.178171


In [51]:
# here we explicitly mentioned the df we need for values in columns 0 and 2
newdf.loc[(newdf[0]<0.3) & (newdf[2]>0.1), [0,2]]

Unnamed: 0,0,2
4,0.147099,0.101266
6,0.171845,0.468255
7,0.281475,0.221098
8,0.270186,0.135626
10,0.059351,0.699725
...,...,...
318,0.271425,0.477666
327,0.108891,0.756038
328,0.036219,0.445040
329,0.021111,0.472087


In [52]:
# if you want the value at that index
newdf.iloc[0,0]

0.65432

In [53]:
# isnull return true where the value is 0 but as in here all are false bcz no value is being 0 here
newdf.isnull()

Unnamed: 0,0,1,2,3,4
0,False,False,False,False,False
1,False,False,False,False,False
2,False,False,False,False,False
3,False,False,False,False,False
4,False,False,False,False,False
...,...,...,...,...,...
329,False,False,False,False,False
330,False,False,False,False,False
331,False,False,False,False,False
332,False,False,False,False,False


### Some other functions

In [56]:
# it will count after grouping
red['name'].value_counts()

name
atif          1
arjit         1
vitalsigns    1
eminem        1
akon          1
Name: count, dtype: int64