# 2.1.1 Object creation

See the Intro to data structures section.
Creating a Series by passing a list of values, letting pandas create a default integer index:

In [1]:
import pandas as pd
import numpy as np

s = pd.Series([1, 3, 5, np.nan, 6, 8])

In [3]:
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

Creating a DataFrame by passing a NumPy array, with a datetime index and labeled columns:

In [18]:
dates = pd.date_range("20130101", periods=20)

In [20]:
dates    #generating dates values in yyyy-mm-dd 

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06', '2013-01-07', '2013-01-08',
               '2013-01-09', '2013-01-10', '2013-01-11', '2013-01-12',
               '2013-01-13', '2013-01-14', '2013-01-15', '2013-01-16',
               '2013-01-17', '2013-01-18', '2013-01-19', '2013-01-20'],
              dtype='datetime64[ns]', freq='D')

In [25]:
dates_df = pd.DataFrame(np.random.randn(20, 4), index=dates, columns=list("ABCD"))

In [23]:
dates_df

Unnamed: 0,A,B,C,D
2013-01-01,-0.734146,-0.190876,-0.911595,2.421994
2013-01-02,-0.094941,0.126662,-1.368024,0.413055
2013-01-03,0.69864,-2.003477,-0.634654,-0.140863
2013-01-04,-0.560187,0.215529,0.143074,0.078621
2013-01-05,1.503066,-0.244649,-0.611939,-0.062458
2013-01-06,1.483063,1.330993,-0.500048,0.504062
2013-01-07,-0.62448,-0.099181,-0.593857,-0.341433
2013-01-08,1.148196,2.071855,2.496152,-0.273628
2013-01-09,1.101611,2.321021,0.607663,0.147131
2013-01-10,-0.400339,-0.116897,1.422844,0.752116


# 2.1.2 Viewing data

Here is how to view the top and bottom rows of the frame:

In [28]:
dates_df.head(7)  # if dataset is large... head will print first few (7 in our example) values

Unnamed: 0,A,B,C,D
2013-01-01,-0.349881,-1.309025,-0.009574,-0.944653
2013-01-02,-1.140735,2.176288,0.776429,0.670674
2013-01-03,0.377398,-0.526109,-1.673832,0.108239
2013-01-04,0.125482,-0.720712,0.409209,0.712693
2013-01-05,0.571175,1.046238,0.689739,1.107548
2013-01-06,-0.856939,-0.319108,1.607833,-1.837946
2013-01-07,0.649997,0.331403,-0.205147,-0.94146


In [34]:
dates_df.tail(7)

Unnamed: 0,A,B,C,D
2013-01-14,-1.229706,-0.07245,0.308444,-1.15143
2013-01-15,-0.401995,-0.477292,1.018245,-0.513386
2013-01-16,1.345684,-0.39993,0.524609,-0.055509
2013-01-17,1.061627,-0.692883,0.562767,0.290116
2013-01-18,-0.943117,-0.060696,-2.694237,0.891378
2013-01-19,-1.611548,-0.137289,-0.496073,1.060174
2013-01-20,-1.425491,0.635629,0.146169,0.151678


In [35]:
dates_df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [41]:
dates_df.dtypes  #prints data types of columns present in datadrame

A    float64
B    float64
C    float64
D    float64
dtype: object

In [42]:
#describe() shows a quick statistic summary of your data:

dates_df.describe()


Unnamed: 0,A,B,C,D
count,20.0,20.0,20.0,20.0
mean,-0.201697,-0.00189,0.163515,-0.075433
std,1.002946,0.988199,1.069592,0.94296
min,-1.611548,-1.339204,-2.694237,-1.837946
25%,-1.077032,-0.567803,-0.257996,-0.942258
50%,-0.375938,-0.104869,0.228859,0.026365
75%,0.701537,0.407459,0.711411,0.741639
max,1.345684,2.176288,1.888359,1.401868


In [46]:
#Transposing your data Transpose=exchangeing rows with columns

df_T=dates_df.T

df_T

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06,2013-01-07,2013-01-08,2013-01-09,2013-01-10,2013-01-11,2013-01-12,2013-01-13,2013-01-14,2013-01-15,2013-01-16,2013-01-17,2013-01-18,2013-01-19,2013-01-20
A,-0.349881,-1.140735,0.377398,0.125482,0.571175,-0.856939,0.649997,-1.055797,0.856158,0.941766,-0.843286,-1.332849,1.228112,-1.229706,-0.401995,1.345684,1.061627,-0.943117,-1.611548,-1.425491
B,-1.309025,2.176288,-0.526109,-0.720712,1.046238,-0.319108,0.331403,-1.339204,0.728462,0.281997,2.136635,-1.316632,-0.003124,-0.07245,-0.477292,-0.39993,-0.692883,-0.060696,-0.137289,0.635629
C,-0.009574,0.776429,-1.673832,0.409209,0.689739,1.607833,-0.205147,1.42519,1.888359,-0.416541,-0.743712,0.003142,0.149273,0.308444,1.018245,0.524609,0.562767,-2.694237,-0.496073,0.146169
D,-0.944653,0.670674,0.108239,0.712693,1.107548,-1.837946,-0.94146,-1.42507,1.401868,-0.961146,0.828478,-0.338552,-0.56235,-1.15143,-0.513386,-0.055509,0.290116,0.891378,1.060174,0.151678


In [49]:
#Sorting by an axis:


dates_df.sort_index(axis=1,ascending=False) #axis=1 sort by columns in reverse order

Unnamed: 0,D,C,B,A
2013-01-01,-0.944653,-0.009574,-1.309025,-0.349881
2013-01-02,0.670674,0.776429,2.176288,-1.140735
2013-01-03,0.108239,-1.673832,-0.526109,0.377398
2013-01-04,0.712693,0.409209,-0.720712,0.125482
2013-01-05,1.107548,0.689739,1.046238,0.571175
2013-01-06,-1.837946,1.607833,-0.319108,-0.856939
2013-01-07,-0.94146,-0.205147,0.331403,0.649997
2013-01-08,-1.42507,1.42519,-1.339204,-1.055797
2013-01-09,1.401868,1.888359,0.728462,0.856158
2013-01-10,-0.961146,-0.416541,0.281997,0.941766


In [50]:
dates_df.sort_index(axis=0,ascending=False)  #axis 0=rowise sort

Unnamed: 0,A,B,C,D
2013-01-20,-1.425491,0.635629,0.146169,0.151678
2013-01-19,-1.611548,-0.137289,-0.496073,1.060174
2013-01-18,-0.943117,-0.060696,-2.694237,0.891378
2013-01-17,1.061627,-0.692883,0.562767,0.290116
2013-01-16,1.345684,-0.39993,0.524609,-0.055509
2013-01-15,-0.401995,-0.477292,1.018245,-0.513386
2013-01-14,-1.229706,-0.07245,0.308444,-1.15143
2013-01-13,1.228112,-0.003124,0.149273,-0.56235
2013-01-12,-1.332849,-1.316632,0.003142,-0.338552
2013-01-11,-0.843286,2.136635,-0.743712,0.828478


# Now onwards we will work on NVIDEA GPU SPECS DATASET

In [2]:

#lets read data from CSV File
gpu_df=pd.read_csv("gpu_specs_v6.csv")

NameError: name 'pd' is not defined

In [1]:
gpu_df["manufacturer"]

NameError: name 'gpu_df' is not defined

In [56]:
gpu_df.shape  #describing rows and columns

(2889, 16)

In [61]:
gpu_df.dtypes

manufacturer      object
productName       object
releaseYear      float64
memSize          float64
memBusWidth      float64
gpuClock           int64
memClock         float64
unifiedShader    float64
tmu                int64
rop                int64
pixelShader      float64
vertexShader     float64
igp               object
bus               object
memType           object
gpuChip           object
dtype: object

In [62]:
gpu_df.describe()

Unnamed: 0,releaseYear,memSize,memBusWidth,gpuClock,memClock,unifiedShader,tmu,rop,pixelShader,vertexShader
count,2845.0,2477.0,2477.0,2889.0,2477.0,2065.0,2889.0,2889.0,824.0,824.0
mean,2010.691388,3.113803,274.874445,661.126687,868.578119,1032.93753,47.429214,18.750087,6.739078,2.622573
std,6.193125,7.175399,653.163896,374.48145,509.987396,1662.834618,73.014849,25.067896,8.091586,2.579388
min,1986.0,3.2e-05,32.0,10.0,5.0,8.0,0.0,0.0,0.0,0.0
25%,2006.0,0.256,128.0,400.0,400.0,144.0,8.0,4.0,2.0,0.0
50%,2011.0,1.024,128.0,600.0,837.0,384.0,20.0,8.0,4.0,2.0
75%,2015.0,3.0,256.0,875.0,1250.0,1280.0,56.0,24.0,8.0,4.0
max,2023.0,128.0,8192.0,2331.0,2257.0,17408.0,880.0,256.0,48.0,24.0


In [64]:
gpu_df.sort_values(by="gpuClock")

Unnamed: 0,manufacturer,productName,releaseYear,memSize,memBusWidth,gpuClock,memClock,unifiedShader,tmu,rop,pixelShader,vertexShader,igp,bus,memType,gpuChip
2829,ATI,Graphics Ultra,1992.0,0.001000,32.0,10,10.0,,0,1,1.0,0.0,No,PCI,VRAM,Mach8
2842,ATI,VGA Improved Performance,1987.0,0.000256,32.0,10,10.0,,0,1,1.0,0.0,No,PCI,DRAM,16899-0
2841,ATI,Graphics Solution Plus,1987.0,0.000064,32.0,10,5.0,,0,0,0.0,0.0,No,PCI,DRAM,CW16800-B
2840,ATI,EGA Wonder 800,1987.0,0.000256,32.0,10,8.0,,0,1,1.0,0.0,No,PCI,DRAM,16899-0
2838,ATI,EGA Wonder 480,1988.0,0.000256,32.0,10,8.0,,0,1,1.0,0.0,No,PCI,DRAM,16899-0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37,AMD,Radeon RX 6500 XT,2022.0,4.000000,64.0,2310,2248.0,1024.0,64,32,,,No,PCIe 4.0 x4,GDDR6,Navi 24
107,AMD,Radeon RX 6700 XT,2021.0,12.000000,192.0,2321,2000.0,2560.0,160,64,,,No,PCIe 4.0 x16,GDDR6,Navi 22
46,AMD,Radeon RX 6850M XT,2022.0,12.000000,192.0,2321,2000.0,2560.0,160,64,,,No,PCIe 4.0 x16,GDDR6,Navi 22
34,AMD,Radeon Pro W6400,2022.0,4.000000,64.0,2331,1750.0,768.0,48,32,,,No,PCIe 4.0 x4,GDDR6,Navi 24


In [65]:
gpu_df.sort_values(by="gpuClock",ascending=False)

Unnamed: 0,manufacturer,productName,releaseYear,memSize,memBusWidth,gpuClock,memClock,unifiedShader,tmu,rop,pixelShader,vertexShader,igp,bus,memType,gpuChip
34,AMD,Radeon Pro W6400,2022.0,4.000,64.0,2331,1750.0,768.0,48,32,,,No,PCIe 4.0 x4,GDDR6,Navi 24
97,AMD,Radeon Pro W6600,2021.0,8.000,128.0,2331,1750.0,1792.0,112,64,,,No,PCIe 4.0 x16,GDDR6,Navi 23
46,AMD,Radeon RX 6850M XT,2022.0,12.000,192.0,2321,2000.0,2560.0,160,64,,,No,PCIe 4.0 x16,GDDR6,Navi 22
107,AMD,Radeon RX 6700 XT,2021.0,12.000,192.0,2321,2000.0,2560.0,160,64,,,No,PCIe 4.0 x16,GDDR6,Navi 22
37,AMD,Radeon RX 6500 XT,2022.0,4.000,64.0,2310,2248.0,1024.0,64,32,,,No,PCIe 4.0 x4,GDDR6,Navi 24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2831,ATI,Graphics Ultra Pro ISA,1992.0,0.002,64.0,10,10.0,,0,1,1.0,0.0,No,PCI,DRAM,Mach32
2830,ATI,Graphics Ultra Pro ISA,1992.0,0.002,64.0,10,10.0,,0,1,1.0,0.0,No,PCI,DRAM,Mach32
2829,ATI,Graphics Ultra,1992.0,0.001,32.0,10,10.0,,0,1,1.0,0.0,No,PCI,VRAM,Mach8
2828,ATI,8514-Ultra,1992.0,0.001,32.0,10,10.0,,0,1,1.0,0.0,No,PCI,VRAM,Mach8


In [84]:
#Selecting a single column, which yields a Series, equivalent to df.A:
#.unique() eliminates duplicate values

a=gpu_df["productName"].unique()
b=gpu_df["releaseYear"].unique()



In [86]:
b,a

(array([2023., 2022., 2021., 2020., 2019., 2018., 2017., 2016., 2015.,
        2014., 2013., 2012., 2011., 2010., 2009., 2008., 2007., 2006.,
        2005., 2004., 2003., 2002., 2001., 2000., 1999., 1998., 1997.,
        1996., 1995., 1994., 1993., 1992., 1990., 1988., 1987., 1986.,
          nan]),
 array(['GeForce RTX 4050', 'Arc A350M', 'Arc A370M', ..., 'Voodoo5 6000',
        'Xe DG1', 'Xe DG1-SDV'], dtype=object))

In [88]:
##Selecting via [], which slices the rows:

gpu_df[10:23]  #this will select values from index 10 to index 22... 23 is excluded




Unnamed: 0,manufacturer,productName,releaseYear,memSize,memBusWidth,gpuClock,memClock,unifiedShader,tmu,rop,pixelShader,vertexShader,igp,bus,memType,gpuChip
10,NVIDIA,GeForce MX550,2022.0,2.0,64.0,1065,1500.0,1024.0,32,16,,,No,PCIe 4.0 x8,GDDR6,TU117
11,NVIDIA,GeForce MX570,2022.0,4.0,64.0,1087,1500.0,2048.0,64,40,,,No,PCIe 4.0 x8,GDDR6,GA107S
12,NVIDIA,GeForce RTX 3050 4 GB,2022.0,4.0,128.0,1545,1750.0,2304.0,72,32,,,No,PCIe 4.0 x8,GDDR6,GA107
13,NVIDIA,GeForce RTX 3050 8 GB,2022.0,8.0,128.0,1552,1750.0,2560.0,80,32,,,No,PCIe 4.0 x8,GDDR6,GA106
14,NVIDIA,GeForce RTX 3050 8 GB GA107,2022.0,8.0,128.0,1552,1750.0,2560.0,80,32,,,No,PCIe 4.0 x8,GDDR6,GA107
15,NVIDIA,GeForce RTX 3050 OEM,2022.0,8.0,128.0,1515,1750.0,2560.0,80,32,,,No,PCIe 4.0 x8,GDDR6,GA106
16,NVIDIA,GeForce RTX 3060 Ti GA103,2022.0,8.0,256.0,1410,1750.0,4864.0,152,80,,,No,PCIe 4.0 x16,GDDR6,GA103S
17,NVIDIA,GeForce RTX 3070 Ti Mobile,2022.0,8.0,256.0,915,1750.0,5632.0,176,80,,,No,PCIe 4.0 x16,GDDR6,GA104
18,NVIDIA,GeForce RTX 3080 12 GB,2022.0,12.0,384.0,1260,1188.0,8960.0,280,96,,,No,PCIe 4.0 x16,GDDR6X,GA102
19,NVIDIA,GeForce RTX 3080 Ti 20 GB,2022.0,20.0,320.0,1335,1188.0,10240.0,320,112,,,No,PCIe 4.0 x16,GDDR6X,GA102


In [92]:
#Selection by label
#selecting cross section ... i.e selecting value at particular index


gpu_df.iloc[6]

manufacturer            Intel
productName          Arc A770
releaseYear            2022.0
memSize                  16.0
memBusWidth             256.0
gpuClock                  300
memClock               1500.0
unifiedShader          4096.0
tmu                       256
rop                       128
pixelShader               NaN
vertexShader              NaN
igp                        No
bus              PCIe 4.0 x16
memType                 GDDR6
gpuChip               DG2-512
Name: 6, dtype: object

In [98]:
#Selecting on a multi-axis by label:

gpu_df.loc[10:20,["gpuClock","memClock"]]

Unnamed: 0,gpuClock,memClock
10,1065,1500.0
11,1087,1500.0
12,1545,1750.0
13,1552,1750.0
14,1552,1750.0
15,1515,1750.0
16,1410,1750.0
17,915,1750.0
18,1260,1188.0
19,1335,1188.0


In [102]:
#For getting a scalar value:


gpu_df.loc[19,"gpuClock"]
#we are getting gpuClock value present at index 19


1335

In [107]:
# selection By integer slices, acting similar to NumPy/Python:

gpu_df.iloc[10:20, 3:9] #this will select rows from 10-20, and columns from 3rd to 8th...


Unnamed: 0,memSize,memBusWidth,gpuClock,memClock,unifiedShader,tmu
10,2.0,64.0,1065,1500.0,1024.0,32
11,4.0,64.0,1087,1500.0,2048.0,64
12,4.0,128.0,1545,1750.0,2304.0,72
13,8.0,128.0,1552,1750.0,2560.0,80
14,8.0,128.0,1552,1750.0,2560.0,80
15,8.0,128.0,1515,1750.0,2560.0,80
16,8.0,256.0,1410,1750.0,4864.0,152
17,8.0,256.0,915,1750.0,5632.0,176
18,12.0,384.0,1260,1188.0,8960.0,280
19,20.0,320.0,1335,1188.0,10240.0,320


In [110]:
#Boolean indexing
#it will print/show those rows where memSize is greater than 10
gpu_df[gpu_df["memSize"]>10]  

Unnamed: 0,manufacturer,productName,releaseYear,memSize,memBusWidth,gpuClock,memClock,unifiedShader,tmu,rop,pixelShader,vertexShader,igp,bus,memType,gpuChip
5,Intel,Arc A730M,2022.0,12.0,192.0,300,1500.0,3072.0,192,96,,,No,PCIe 4.0 x16,GDDR6,DG2-512
6,Intel,Arc A770,2022.0,16.0,256.0,300,1500.0,4096.0,256,128,,,No,PCIe 4.0 x16,GDDR6,DG2-512
7,Intel,Arc A770M,2022.0,16.0,256.0,300,1500.0,4096.0,256,128,,,No,PCIe 4.0 x16,GDDR6,DG2-512
8,Intel,Arc A780,2022.0,16.0,256.0,300,1093.0,4096.0,256,128,,,No,PCIe 4.0 x16,GDDR6X,DG2-512
9,Intel,Arctic Sound-M,2022.0,16.0,4096.0,900,1200.0,8192.0,256,128,,,No,PCIe 4.0 x16,HBM2e,Arctic Sound
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1186,Intel,Xeon Phi 7120P,2013.0,16.0,512.0,1238,1375.0,976.0,32,0,,,No,PCIe 3.0 x16,GDDR5,Knights Corner
1187,Intel,Xeon Phi 7120X,2013.0,16.0,512.0,1238,1375.0,976.0,32,0,,,No,PCIe 3.0 x16,GDDR5,Knights Corner
2853,NVIDIA,GeForce RTX 3070 Ti 16 GB,,16.0,256.0,1575,1188.0,6144.0,192,96,,,No,PCIe 4.0 x16,GDDR6X,GA104
2866,AMD,Radeon Pro V7350X2,,16.0,256.0,1188,1750.0,2304.0,144,32,,,No,PCIe 3.0 x16,GDDR5,Ellesmere


# SETTING/APPENDING

In [113]:
#setting up a new column

s=pd.Series(np.random.rand(2889))
s


0       0.301238
1       0.854746
2       0.646386
3       0.366595
4       0.005925
          ...   
2884    0.157377
2885    0.139304
2886    0.677035
2887    0.288926
2888    0.091029
Length: 2889, dtype: float64

0       0.162932
1       0.258576
2       0.006955
3       0.462313
4       0.050218
          ...   
2884    0.438216
2885    0.436368
2886    0.502154
2887    0.217004
2888    0.820694
Length: 2889, dtype: float64

In [116]:
gpu_df["new_col"]=s  #we appended new_col at the rightmost end of our dataFrame

In [118]:
gpu_df.head(10)

Unnamed: 0,manufacturer,productName,releaseYear,memSize,memBusWidth,gpuClock,memClock,unifiedShader,tmu,rop,pixelShader,vertexShader,igp,bus,memType,gpuChip,new_col
0,NVIDIA,GeForce RTX 4050,2023.0,8.0,128.0,1925,2250.0,3840.0,120,48,,,No,PCIe 4.0 x16,GDDR6,AD106,0.301238
1,Intel,Arc A350M,2022.0,4.0,64.0,300,1500.0,768.0,48,24,,,No,PCIe 4.0 x8,GDDR6,DG2-128,0.854746
2,Intel,Arc A370M,2022.0,4.0,64.0,300,1500.0,1024.0,64,32,,,No,PCIe 4.0 x8,GDDR6,DG2-128,0.646386
3,Intel,Arc A380,2022.0,4.0,64.0,300,1500.0,1024.0,64,32,,,No,PCIe 4.0 x8,GDDR6,DG2-128,0.366595
4,Intel,Arc A550M,2022.0,8.0,128.0,300,1500.0,2048.0,128,64,,,No,PCIe 4.0 x16,GDDR6,DG2-512,0.005925
5,Intel,Arc A730M,2022.0,12.0,192.0,300,1500.0,3072.0,192,96,,,No,PCIe 4.0 x16,GDDR6,DG2-512,0.648927
6,Intel,Arc A770,2022.0,16.0,256.0,300,1500.0,4096.0,256,128,,,No,PCIe 4.0 x16,GDDR6,DG2-512,0.407142
7,Intel,Arc A770M,2022.0,16.0,256.0,300,1500.0,4096.0,256,128,,,No,PCIe 4.0 x16,GDDR6,DG2-512,0.917086
8,Intel,Arc A780,2022.0,16.0,256.0,300,1093.0,4096.0,256,128,,,No,PCIe 4.0 x16,GDDR6X,DG2-512,0.46463
9,Intel,Arctic Sound-M,2022.0,16.0,4096.0,900,1200.0,8192.0,256,128,,,No,PCIe 4.0 x16,HBM2e,Arctic Sound,0.957726


# handeling Missing data

In [125]:
df2=gpu_df.iloc[: , :7]

In [126]:
df2

Unnamed: 0,manufacturer,productName,releaseYear,memSize,memBusWidth,gpuClock,memClock
0,NVIDIA,GeForce RTX 4050,2023.0,8.000,128.0,1925,2250.0
1,Intel,Arc A350M,2022.0,4.000,64.0,300,1500.0
2,Intel,Arc A370M,2022.0,4.000,64.0,300,1500.0
3,Intel,Arc A380,2022.0,4.000,64.0,300,1500.0
4,Intel,Arc A550M,2022.0,8.000,128.0,300,1500.0
...,...,...,...,...,...,...,...
2884,3dfx,Voodoo5 5000 AGP,,0.016,128.0,166,166.0
2885,3dfx,Voodoo5 5000 PCI,,0.016,128.0,166,166.0
2886,3dfx,Voodoo5 6000,,0.032,128.0,166,166.0
2887,Intel,Xe DG1,,4.000,128.0,900,2133.0


In [131]:
df2.dropna()  #as we can see here it dropped rows containing NaN values.. original rows=2889, after dropna= 2433

Unnamed: 0,manufacturer,productName,releaseYear,memSize,memBusWidth,gpuClock,memClock
0,NVIDIA,GeForce RTX 4050,2023.0,8.000000,128.0,1925,2250.0
1,Intel,Arc A350M,2022.0,4.000000,64.0,300,1500.0
2,Intel,Arc A370M,2022.0,4.000000,64.0,300,1500.0
3,Intel,Arc A380,2022.0,4.000000,64.0,300,1500.0
4,Intel,Arc A550M,2022.0,8.000000,128.0,300,1500.0
...,...,...,...,...,...,...,...
2840,ATI,EGA Wonder 800,1987.0,0.000256,32.0,10,8.0
2841,ATI,Graphics Solution Plus,1987.0,0.000064,32.0,10,5.0
2842,ATI,VGA Improved Performance,1987.0,0.000256,32.0,10,10.0
2843,ATI,Color Emulation Card,1986.0,0.000032,32.0,10,5.0


In [133]:
#filling Nan values with some assigned value


df2.fillna(value=7)

Unnamed: 0,manufacturer,productName,releaseYear,memSize,memBusWidth,gpuClock,memClock
0,NVIDIA,GeForce RTX 4050,2023.0,8.000,128.0,1925,2250.0
1,Intel,Arc A350M,2022.0,4.000,64.0,300,1500.0
2,Intel,Arc A370M,2022.0,4.000,64.0,300,1500.0
3,Intel,Arc A380,2022.0,4.000,64.0,300,1500.0
4,Intel,Arc A550M,2022.0,8.000,128.0,300,1500.0
...,...,...,...,...,...,...,...
2884,3dfx,Voodoo5 5000 AGP,7.0,0.016,128.0,166,166.0
2885,3dfx,Voodoo5 5000 PCI,7.0,0.016,128.0,166,166.0
2886,3dfx,Voodoo5 6000,7.0,0.032,128.0,166,166.0
2887,Intel,Xe DG1,7.0,4.000,128.0,900,2133.0


In [136]:
#getting masked values  true/false 
#true if value is Nan    false otherwise
pd.isna(df2)

Unnamed: 0,manufacturer,productName,releaseYear,memSize,memBusWidth,gpuClock,memClock
0,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...
2884,False,False,True,False,False,False,False
2885,False,False,True,False,False,False,False
2886,False,False,True,False,False,False,False
2887,False,False,True,False,False,False,False


#  Operations

In [141]:
df3=gpu_df.iloc[: , 2:7]


#finding mean of columns
df3.mean()

releaseYear    2010.691388
memSize           3.113803
memBusWidth     274.874445
gpuClock        661.126687
memClock        868.578119
dtype: float64

In [143]:
df3

Unnamed: 0,releaseYear,memSize,memBusWidth,gpuClock,memClock
0,2023.0,8.000,128.0,1925,2250.0
1,2022.0,4.000,64.0,300,1500.0
2,2022.0,4.000,64.0,300,1500.0
3,2022.0,4.000,64.0,300,1500.0
4,2022.0,8.000,128.0,300,1500.0
...,...,...,...,...,...
2884,,0.016,128.0,166,166.0
2885,,0.016,128.0,166,166.0
2886,,0.032,128.0,166,166.0
2887,,4.000,128.0,900,2133.0


# Apply
Applying functions to the data:

In [157]:
df3.apply(np.cumsum) #cumsum ==cumulative sum

Unnamed: 0,releaseYear,memSize,memBusWidth,gpuClock,memClock
0,2023.0,8.000000,128.0,1925,2250.0
1,4045.0,12.000000,192.0,2225,3750.0
2,6067.0,16.000000,256.0,2525,5250.0
3,8089.0,20.000000,320.0,2825,6750.0
4,10111.0,28.000000,448.0,3125,8250.0
...,...,...,...,...,...
2884,,7700.841952,680352.0,1907863,2146870.0
2885,,7700.857952,680480.0,1908029,2147036.0
2886,,7700.889952,680608.0,1908195,2147202.0
2887,,7704.889952,680736.0,1909095,2149335.0


In [155]:
df3.cummax().head(30)

Unnamed: 0,releaseYear,memSize,memBusWidth,gpuClock,memClock
0,2023.0,8.0,128.0,1925,2250.0
1,2023.0,8.0,128.0,1925,2250.0
2,2023.0,8.0,128.0,1925,2250.0
3,2023.0,8.0,128.0,1925,2250.0
4,2023.0,8.0,128.0,1925,2250.0
5,2023.0,12.0,192.0,1925,2250.0
6,2023.0,16.0,256.0,1925,2250.0
7,2023.0,16.0,256.0,1925,2250.0
8,2023.0,16.0,256.0,1925,2250.0
9,2023.0,16.0,4096.0,1925,2250.0


In [156]:
df3

Unnamed: 0,releaseYear,memSize,memBusWidth,gpuClock,memClock
0,2023.0,8.000,128.0,1925,2250.0
1,2022.0,4.000,64.0,300,1500.0
2,2022.0,4.000,64.0,300,1500.0
3,2022.0,4.000,64.0,300,1500.0
4,2022.0,8.000,128.0,300,1500.0
...,...,...,...,...,...
2884,,0.016,128.0,166,166.0
2885,,0.016,128.0,166,166.0
2886,,0.032,128.0,166,166.0
2887,,4.000,128.0,900,2133.0


In [160]:
#it will show max-min value for each column

df3.apply(lambda x: x.max() - x.min())

releaseYear      37.000000
memSize         127.999968
memBusWidth    8160.000000
gpuClock       2321.000000
memClock       2252.000000
dtype: float64

In [168]:
#counts or returns the number of time particular value has been occured

df3["releaseYear"].value_counts()

2013.0    225
2008.0    174
2012.0    173
2011.0    170
2015.0    165
2007.0    156
2014.0    149
2010.0    148
2004.0    132
2006.0    129
2003.0    120
2017.0    110
2016.0    109
2009.0    106
2019.0    105
2005.0     97
2018.0     93
2021.0     88
2020.0     86
2002.0     70
2022.0     57
2001.0     41
1999.0     41
2000.0     32
1998.0     21
1997.0      9
1995.0      9
1996.0      8
1992.0      8
1987.0      3
1993.0      3
1988.0      2
1986.0      2
1990.0      2
1994.0      1
2023.0      1
Name: releaseYear, dtype: int64

In [170]:
df3.groupby("releaseYear").sum()

Unnamed: 0_level_0,memSize,memBusWidth,gpuClock,memClock
releaseYear,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1986.0,9.6e-05,64.0,20,10.0
1987.0,0.000576,96.0,30,23.0
1988.0,0.000512,64.0,35,16.0
1990.0,0.000768,64.0,20,20.0
1992.0,0.01,384.0,80,80.0
1993.0,0.003,192.0,86,103.0
1994.0,0.001,32.0,53,33.0
1995.0,0.016,576.0,452,416.0
1996.0,0.023,576.0,458,572.0
1997.0,0.047,672.0,703,733.0


In [178]:
#this works like sql query, first it will groupe/devide by 1st value, then group by 2nd.. and so on

df3.groupby(["releaseYear","gpuClock"]).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,memSize,memBusWidth,memClock
releaseYear,gpuClock,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1986.0,10,0.000096,64.0,10.0
1987.0,10,0.000576,96.0,23.0
1988.0,10,0.000256,32.0,8.0
1988.0,25,0.000256,32.0,8.0
1990.0,10,0.000768,64.0,20.0
...,...,...,...,...
2022.0,2150,12.000000,192.0,2250.0
2022.0,2310,4.000000,64.0,2248.0
2022.0,2321,12.000000,192.0,2000.0
2022.0,2331,4.000000,64.0,1750.0


# Time series
pandas has simple, powerful, and efficient functionality for performing resampling operations during frequency con-
version (e.g., converting secondly data into 5-minutely data). This is extremely common in, but not limited to, financial
applications.

In [185]:
rng = pd.date_range("1/1/2012", periods=100, freq="S")
rng

DatetimeIndex(['2012-01-01 00:00:00', '2012-01-01 00:00:01',
               '2012-01-01 00:00:02', '2012-01-01 00:00:03',
               '2012-01-01 00:00:04', '2012-01-01 00:00:05',
               '2012-01-01 00:00:06', '2012-01-01 00:00:07',
               '2012-01-01 00:00:08', '2012-01-01 00:00:09',
               '2012-01-01 00:00:10', '2012-01-01 00:00:11',
               '2012-01-01 00:00:12', '2012-01-01 00:00:13',
               '2012-01-01 00:00:14', '2012-01-01 00:00:15',
               '2012-01-01 00:00:16', '2012-01-01 00:00:17',
               '2012-01-01 00:00:18', '2012-01-01 00:00:19',
               '2012-01-01 00:00:20', '2012-01-01 00:00:21',
               '2012-01-01 00:00:22', '2012-01-01 00:00:23',
               '2012-01-01 00:00:24', '2012-01-01 00:00:25',
               '2012-01-01 00:00:26', '2012-01-01 00:00:27',
               '2012-01-01 00:00:28', '2012-01-01 00:00:29',
               '2012-01-01 00:00:30', '2012-01-01 00:00:31',
               '2012-01-

In [188]:
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
ts


2012-01-01 00:00:00    269
2012-01-01 00:00:01      7
2012-01-01 00:00:02    399
2012-01-01 00:00:03    205
2012-01-01 00:00:04    322
                      ... 
2012-01-01 00:01:35    156
2012-01-01 00:01:36     35
2012-01-01 00:01:37     11
2012-01-01 00:01:38    251
2012-01-01 00:01:39    438
Freq: S, Length: 100, dtype: int64

In [194]:
ts.resample("5Min").sum()

2012-01-01    25263
Freq: 5T, dtype: int64

In [195]:
rng = pd.date_range("3/6/2012 00:00", periods=100, freq="D")

In [196]:
rng

DatetimeIndex(['2012-03-06', '2012-03-07', '2012-03-08', '2012-03-09',
               '2012-03-10', '2012-03-11', '2012-03-12', '2012-03-13',
               '2012-03-14', '2012-03-15', '2012-03-16', '2012-03-17',
               '2012-03-18', '2012-03-19', '2012-03-20', '2012-03-21',
               '2012-03-22', '2012-03-23', '2012-03-24', '2012-03-25',
               '2012-03-26', '2012-03-27', '2012-03-28', '2012-03-29',
               '2012-03-30', '2012-03-31', '2012-04-01', '2012-04-02',
               '2012-04-03', '2012-04-04', '2012-04-05', '2012-04-06',
               '2012-04-07', '2012-04-08', '2012-04-09', '2012-04-10',
               '2012-04-11', '2012-04-12', '2012-04-13', '2012-04-14',
               '2012-04-15', '2012-04-16', '2012-04-17', '2012-04-18',
               '2012-04-19', '2012-04-20', '2012-04-21', '2012-04-22',
               '2012-04-23', '2012-04-24', '2012-04-25', '2012-04-26',
               '2012-04-27', '2012-04-28', '2012-04-29', '2012-04-30',
      

In [197]:
ts = pd.Series(np.random.randn(len(rng)), rng)

In [198]:
ts

2012-03-06   -2.315210
2012-03-07    0.645906
2012-03-08    0.505145
2012-03-09   -1.136596
2012-03-10    0.875005
                ...   
2012-06-09    0.892278
2012-06-10   -0.272271
2012-06-11   -0.353001
2012-06-12   -0.868946
2012-06-13   -1.727349
Freq: D, Length: 100, dtype: float64

In [201]:
ts_utc = ts.tz_localize("UTC") #converting to UTC Timezone

In [202]:
ts_utc

2012-03-06 00:00:00+00:00   -2.315210
2012-03-07 00:00:00+00:00    0.645906
2012-03-08 00:00:00+00:00    0.505145
2012-03-09 00:00:00+00:00   -1.136596
2012-03-10 00:00:00+00:00    0.875005
                               ...   
2012-06-09 00:00:00+00:00    0.892278
2012-06-10 00:00:00+00:00   -0.272271
2012-06-11 00:00:00+00:00   -0.353001
2012-06-12 00:00:00+00:00   -0.868946
2012-06-13 00:00:00+00:00   -1.727349
Freq: D, Length: 100, dtype: float64

In [203]:
ts_utc.tz_convert("US/Eastern")

2012-03-05 19:00:00-05:00   -2.315210
2012-03-06 19:00:00-05:00    0.645906
2012-03-07 19:00:00-05:00    0.505145
2012-03-08 19:00:00-05:00   -1.136596
2012-03-09 19:00:00-05:00    0.875005
                               ...   
2012-06-08 20:00:00-04:00    0.892278
2012-06-09 20:00:00-04:00   -0.272271
2012-06-10 20:00:00-04:00   -0.353001
2012-06-11 20:00:00-04:00   -0.868946
2012-06-12 20:00:00-04:00   -1.727349
Freq: D, Length: 100, dtype: float64

# Categorical DATA
pandas can include categorical data in a DataFrame.

In [227]:
df = pd.DataFrame(
{"id": [1, 2, 3, 4, 5, 6], "raw_grade": ["a", "b", "b", "a", "a", "e"]}
)

df["raw_grade"]

0    a
1    b
2    b
3    a
4    a
5    e
Name: raw_grade, dtype: object

In [228]:
#Converting the raw grades to a categorical data type:

df["grade"] = df["raw_grade"].astype("category")

In [229]:
df["grade"]

0    a
1    b
2    b
3    a
4    a
5    e
Name: grade, dtype: category
Categories (3, object): ['a', 'b', 'e']

In [230]:
#Rename the categories to more meaningful names (assigning to Series.cat.categories() is in place!):

In [232]:
df["grade"].cat.categories = ["very good", "good", "very bad"]

df["grade"]

0    very good
1         good
2         good
3    very good
4    very good
5     very bad
Name: grade, dtype: category
Categories (3, object): ['very good', 'good', 'very bad']

In [233]:
df["grade"] = df["grade"].cat.set_categories(

["very bad", "bad", "medium", "good", "very good"]
)

In [234]:
df["grade"]

0    very good
1         good
2         good
3    very good
4    very good
5     very bad
Name: grade, dtype: category
Categories (5, object): ['very bad', 'bad', 'medium', 'good', 'very good']

In [237]:
df.groupby(["grade"]).size()

grade
very bad     1
bad          0
medium       0
good         2
very good    3
dtype: int64

# Getting data in/out

1.writing to csv

In [238]:
df.to_csv("1.csv")

In [239]:
pd.read_csv("1.csv")

Unnamed: 0.1,Unnamed: 0,id,raw_grade,grade
0,0,1,a,very good
1,1,2,b,good
2,2,3,b,good
3,3,4,a,very good
4,4,5,a,very good
5,5,6,e,very bad


2.HDF5

Reading and writing to HDFStores.

Writing to a HDF5 Store:

    
    


In [241]:
df.to_hdf("2.h5", "df",format="table")


In [242]:
#Reading from a HDF5 Store:

pd.read_hdf("2.h5", "df")

Unnamed: 0,id,raw_grade,grade
0,1,a,very good
1,2,b,good
2,3,b,good
3,4,a,very good
4,5,a,very good
5,6,e,very bad


3.EXCEL 

In [243]:
df3.to_excel("3.xlsx", sheet_name="Sheet1")

In [245]:
pd.read_excel("3.xlsx","Sheet1",index_col=None, na_values=["NA"])

Unnamed: 0.1,Unnamed: 0,releaseYear,memSize,memBusWidth,gpuClock,memClock
0,0,2023.0,8.000,128.0,1925,2250.0
1,1,2022.0,4.000,64.0,300,1500.0
2,2,2022.0,4.000,64.0,300,1500.0
3,3,2022.0,4.000,64.0,300,1500.0
4,4,2022.0,8.000,128.0,300,1500.0
...,...,...,...,...,...,...
2884,2884,,0.016,128.0,166,166.0
2885,2885,,0.016,128.0,166,166.0
2886,2886,,0.032,128.0,166,166.0
2887,2887,,4.000,128.0,900,2133.0
