In [27]:
import numpy as np
import pandas as pd

data=np.array([["John",20,"Male"],["",22,"Female"],["Henry",25,"Male"],["Smith",30,"Male"],["Susan",27,"Female"],["Minendra",19,"Male"]])
print(data)

df=pd.DataFrame(data,columns=['Name','Age','Gender'])

print(df)

[['John' '20' 'Male']
 ['' '22' 'Female']
 ['Henry' '25' 'Male']
 ['Smith' '30' 'Male']
 ['Susan' '27' 'Female']
 ['Minendra' '19' 'Male']]
       Name Age  Gender
0      John  20    Male
1            22  Female
2     Henry  25    Male
3     Smith  30    Male
4     Susan  27  Female
5  Minendra  19    Male


#### Pandas DataFrame comes is a powerful tool that allows us to store and manipulate data in a structured way, 

In [28]:
df.index

RangeIndex(start=0, stop=6, step=1)

In [29]:
print(df.head(4))

    Name Age  Gender
0   John  20    Male
1         22  Female
2  Henry  25    Male
3  Smith  30    Male


In [30]:
print(df.head())

    Name Age  Gender
0   John  20    Male
1         22  Female
2  Henry  25    Male
3  Smith  30    Male
4  Susan  27  Female


In [31]:
print(df.tail(2))

       Name Age  Gender
4     Susan  27  Female
5  Minendra  19    Male


In [33]:
print(df.tail())

       Name Age  Gender
1            22  Female
2     Henry  25    Male
3     Smith  30    Male
4     Susan  27  Female
5  Minendra  19    Male


## Difference Between Series and DataFrame in Pandas

In **Pandas**, both `Series` and `DataFrame` are data structures used for data manipulation.

### Series
- A **Series** is a one-dimensional labeled array capable of holding any data type (integers, strings, floating point numbers, Python objects, etc.).
- It is similar to a column in a spreadsheet or a database table.

### DataFrame
- A **DataFrame** is a two-dimensional labeled data structure with columns of potentially different types.
- It is similar to a table in a database or an Excel spreadsheet.

In [35]:
import pandas as pd 
series = pd.Series([10,20,30,40],name="Numbers")
print(series)

print("\nType of Series: ",type(series))

0    10
1    20
2    30
3    40
Name: Numbers, dtype: int64

Type of Series:  <class 'pandas.core.series.Series'>


In [56]:
#Creating data frame
data={
    "Name":["Alice","Bob","Charlie","David"],
    "Age":[25,30,35,40]
}
df=pd.DataFrame(data)
print(df)

print("\nType of Dataframe: ",type(df))

      Name  Age
0    Alice   25
1      Bob   30
2  Charlie   35
3    David   40

Type of Dataframe:  <class 'pandas.core.frame.DataFrame'>


In [37]:
df.index

RangeIndex(start=0, stop=6, step=1)

In [44]:
print(df.index.values)

[0 1 2 3]


In [57]:
df=df.set_index('Name')
print(df)

         Age
Name        
Alice     25
Bob       30
Charlie   35
David     40


In [46]:
df.index.values

array(['Alice', 'Bob', 'Charlie', 'David'], dtype=object)

In [47]:
df.index.value_counts

<bound method IndexOpsMixin.value_counts of Index(['Alice', 'Bob', 'Charlie', 'David'], dtype='object', name='Name')>

In [None]:
import pandas as pd 
print("Before reset_index(): ")
print(df)
df.reset_index(inplace=True)

print("\nAfter reset_index() with inplace=True: ")
print(df)

Before reset_index(): 
         Age
Name        
Alice     25
Bob       30
Charlie   35
David     40

After reset_index() with inplace=True: 
      Name  Age
0    Alice   25
1      Bob   30
2  Charlie   35
3    David   40


In [62]:
#Set custom index 
df.index=['a','b','c','d']
print(df)

      Name  Age
a    Alice   25
b      Bob   30
c  Charlie   35
d    David   40


In [63]:
df.reset_index(drop=True,inplace=True)
print(df)

      Name  Age
0    Alice   25
1      Bob   30
2  Charlie   35
3    David   40


In [64]:
#get index as a list 
index_lst=df.index.tolist()
print(index_lst)

[0, 1, 2, 3]


In [65]:
df["Name"]

0      Alice
1        Bob
2    Charlie
3      David
Name: Name, dtype: object

In [66]:
df["Age"]

0    25
1    30
2    35
3    40
Name: Age, dtype: int64

In [70]:
type(df[["Name"]])

pandas.core.frame.DataFrame

In [68]:
type(df["Name"])

pandas.core.series.Series

In [71]:
df[["Name"]]

Unnamed: 0,Name
0,Alice
1,Bob
2,Charlie
3,David


In [72]:
row=df.loc[1]
print(row)

Name    Bob
Age      30
Name: 1, dtype: object


In [74]:
for i in range(3):
    row=df.loc[i]
    print("\n",row)


 Name    Alice
Age        25
Name: 0, dtype: object

 Name    Bob
Age      30
Name: 1, dtype: object

 Name    Charlie
Age          35
Name: 2, dtype: object


In [76]:
rows=df.loc[::1,['Name']]
print(rows)

      Name
0    Alice
1      Bob
2  Charlie
3    David


In [79]:
rows=df.loc[1:2,['Name','Age']]
print(rows)

      Name  Age
1      Bob   30
2  Charlie   35


In [81]:
data=np.array([["John",20,"Male"],["Kim",22,"Female"],["Henry",25,"Male"],["Smith",30,"Male"],["Austin",27,"Male"]])
df=pd.DataFrame(data,columns=['Name','Age','Gender'],index=['A','B','C','D','E'])
print(df)

     Name Age  Gender
A    John  20    Male
B     Kim  22  Female
C   Henry  25    Male
D   Smith  30    Male
E  Austin  27    Male


In [82]:
df.loc["A":"D":2,["Name","Age"]]

Unnamed: 0,Name,Age
A,John,20
C,Henry,25


In [83]:
df.loc[["A","B"]]

Unnamed: 0,Name,Age,Gender
A,John,20,Male
B,Kim,22,Female


In [86]:
#Filtering dataframes
#use | for or conditional

print(df[(df["Age"].astype(int)>22) & (df["Gender"].astype(str)=="Male")])
print(df[(df["Age"].astype(int)>22) | (df["Gender"].astype(str)=="Male")])

     Name Age Gender
C   Henry  25   Male
D   Smith  30   Male
E  Austin  27   Male
     Name Age Gender
A    John  20   Male
C   Henry  25   Male
D   Smith  30   Male
E  Austin  27   Male


In [88]:
#Filtering and setting value 
df.loc[(df["Age"].astype(int)>22)&(df["Gender"].astype(str)=="Male"),"Age"]=35
print(df[df["Age"].astype(int)>22])

     Name Age Gender
C   Henry  35   Male
D   Smith  35   Male
E  Austin  35   Male


In [89]:
dicts={
    'items':["Laptop","Mouse","Book","Pencil"],
    'quantity':[5,10,15,20],
    'unit price':[700,20,10,2]
}
df=pd.DataFrame(dicts)
print(df)

    items  quantity  unit price
0  Laptop         5         700
1   Mouse        10          20
2    Book        15          10
3  Pencil        20           2


In [108]:
dicts2={
    #'items':["Laptop","Mouse","Book","Pencil"],
    'colour':["Green","Blue","Red","Black"],
}
df2=pd.DataFrame(dicts2)
print(df2)

  colour
0  Green
1   Blue
2    Red
3  Black


In [109]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   items       4 non-null      object
 1   quantity    4 non-null      int64 
 2   unit price  4 non-null      int64 
dtypes: int64(2), object(1)
memory usage: 228.0+ bytes


In [110]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   colour  4 non-null      object
dtypes: object(1)
memory usage: 164.0+ bytes


In [111]:
df.describe()

Unnamed: 0,quantity,unit price
count,4.0,4.0
mean,12.5,183.0
std,6.454972,344.745317
min,5.0,2.0
25%,8.75,8.0
50%,12.5,15.0
75%,16.25,190.0
max,20.0,700.0


In [114]:
frames=[df,df2]
res=pd.concat(frames,axis=1)
print(res)

    items  quantity  unit price colour
0  Laptop         5         700  Green
1   Mouse        10          20   Blue
2    Book        15          10    Red
3  Pencil        20           2  Black


In [115]:
res.duplicated()

0    False
1    False
2    False
3    False
dtype: bool

In [122]:
import pandas as pd

dict_try={
    'items':["Laptop","Mouse","Mouse","Book","Pencil"],
    'quantity':[5,10,10,15,20],
    'unit price':[700,20,20,10,2]
}
df3=pd.DataFrame(dict_try)
print(df3)

    items  quantity  unit price
0  Laptop         5         700
1   Mouse        10          20
2   Mouse        10          20
3    Book        15          10
4  Pencil        20           2


In [123]:
df3.duplicated('items')

0    False
1    False
2     True
3    False
4    False
dtype: bool

In [124]:
df3.drop_duplicates()

Unnamed: 0,items,quantity,unit price
0,Laptop,5,700
1,Mouse,10,20
3,Book,15,10
4,Pencil,20,2


In [126]:
import kagglehub

df=pd.read_csv("BIKE DETAILS.csv")
print(df)

                                     name  selling_price  year seller_type  \
0               Royal Enfield Classic 350         175000  2019  Individual   
1                               Honda Dio          45000  2017  Individual   
2     Royal Enfield Classic Gunmetal Grey         150000  2018  Individual   
3       Yamaha Fazer FI V 2.0 [2016-2018]          65000  2015  Individual   
4                   Yamaha SZ [2013-2014]          20000  2011  Individual   
...                                   ...            ...   ...         ...   
1056                            Activa 3g          17000  2010  Individual   
1057                     Honda CB twister          16000  2012  Individual   
1058                   Bajaj Discover 125          15000  2013  Individual   
1059                       Honda CB Shine          12000  2009  Individual   
1060                     Bajaj Pulsar 150          10000  2008  Individual   

          owner  km_driven  ex_showroom_price  
0     1st owner

In [127]:
df.head()

Unnamed: 0,name,selling_price,year,seller_type,owner,km_driven,ex_showroom_price
0,Royal Enfield Classic 350,175000,2019,Individual,1st owner,350,
1,Honda Dio,45000,2017,Individual,1st owner,5650,
2,Royal Enfield Classic Gunmetal Grey,150000,2018,Individual,1st owner,12000,148114.0
3,Yamaha Fazer FI V 2.0 [2016-2018],65000,2015,Individual,1st owner,23000,89643.0
4,Yamaha SZ [2013-2014],20000,2011,Individual,2nd owner,21000,


In [128]:
df.tail()

Unnamed: 0,name,selling_price,year,seller_type,owner,km_driven,ex_showroom_price
1056,Activa 3g,17000,2010,Individual,1st owner,500000,52000.0
1057,Honda CB twister,16000,2012,Individual,1st owner,33000,51000.0
1058,Bajaj Discover 125,15000,2013,Individual,2nd owner,35000,57000.0
1059,Honda CB Shine,12000,2009,Individual,1st owner,53000,58000.0
1060,Bajaj Pulsar 150,10000,2008,Individual,1st owner,92233,75000.0


In [130]:
df.shape

(1061, 7)

In [131]:
df.index.values

array([   0,    1,    2, ..., 1058, 1059, 1060], shape=(1061,))

In [132]:
df.index

RangeIndex(start=0, stop=1061, step=1)

In [142]:

df=pd.read_csv("BIKE DETAILS.csv")
df['bike_country'] = 'nepal'
print(df.head(5))


                                  name  selling_price  year seller_type  \
0            Royal Enfield Classic 350         175000  2019  Individual   
1                            Honda Dio          45000  2017  Individual   
2  Royal Enfield Classic Gunmetal Grey         150000  2018  Individual   
3    Yamaha Fazer FI V 2.0 [2016-2018]          65000  2015  Individual   
4                Yamaha SZ [2013-2014]          20000  2011  Individual   

       owner  km_driven  ex_showroom_price bike_country  
0  1st owner        350                NaN        nepal  
1  1st owner       5650                NaN        nepal  
2  1st owner      12000           148114.0        nepal  
3  1st owner      23000            89643.0        nepal  
4  2nd owner      21000                NaN        nepal  


In [134]:
df.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
1056    False
1057    False
1058    False
1059    False
1060    False
Length: 1061, dtype: bool

In [136]:
df.head(100)

Unnamed: 0,name,selling_price,year,seller_type,owner,km_driven,ex_showroom_price
0,Royal Enfield Classic 350,175000,2019,Individual,1st owner,350,
1,Honda Dio,45000,2017,Individual,1st owner,5650,
2,Royal Enfield Classic Gunmetal Grey,150000,2018,Individual,1st owner,12000,148114.0
3,Yamaha Fazer FI V 2.0 [2016-2018],65000,2015,Individual,1st owner,23000,89643.0
4,Yamaha SZ [2013-2014],20000,2011,Individual,2nd owner,21000,
...,...,...,...,...,...,...,...
95,Honda Activa 4G,45000,2018,Individual,1st owner,25000,53079.0
96,Yamaha FZ25,150000,2019,Individual,1st owner,2500,133680.0
97,Bajaj Pulsar 150,42000,2015,Individual,1st owner,22770,
98,Hero Splendor Plus,30000,2015,Individual,1st owner,30000,


In [137]:
df.isnull()

Unnamed: 0,name,selling_price,year,seller_type,owner,km_driven,ex_showroom_price
0,False,False,False,False,False,False,True
1,False,False,False,False,False,False,True
2,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False
4,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...
1056,False,False,False,False,False,False,False
1057,False,False,False,False,False,False,False
1058,False,False,False,False,False,False,False
1059,False,False,False,False,False,False,False


In [138]:
import pandas as pd

pd.set_option('display.max_rows', None)   # Show all rows
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.width', None)      # Auto-detect width of the terminal
pd.set_option('display.max_colwidth', None)  # Do not truncate column values

print(df)

                                            name  selling_price  year  \
0                      Royal Enfield Classic 350         175000  2019   
1                                      Honda Dio          45000  2017   
2            Royal Enfield Classic Gunmetal Grey         150000  2018   
3              Yamaha Fazer FI V 2.0 [2016-2018]          65000  2015   
4                          Yamaha SZ [2013-2014]          20000  2011   
5                               Honda CB Twister          18000  2010   
6                           Honda CB Hornet 160R          78500  2018   
7           Royal Enfield Bullet 350 [2007-2011]         180000  2008   
8                         Hero Honda CBZ extreme          30000  2010   
9                             Bajaj Discover 125          50000  2016   
10                                   Yamaha FZ16          35000  2015   
11                                    Honda Navi          28000  2016   
12                      Bajaj Avenger Street 220   