# Pandas

In [1]:
# install pandas
!pip install pandas



In [3]:
# import libraries
import pandas as pd

## Series
A Series is a one-dimensional labeled array: every value is paired with an index label.

Creating a Series from a list

In [7]:
# Source values for the Series, held in a plain Python list.
data = [10, 20, 30, 40, 50]

# No index is supplied, so pandas labels the values 0, 1, 2, ... by position.
series = pd.Series(data=data)
series

0    10
1    20
2    30
3    40
4    50
dtype: int64

In [20]:
# Index labels (employee names) and the values they label (salaries).
Names = ['Dan', 'Tee', 'Kay']
Salary = [50000.00, 80000.00, 80000.00]

# A named Series with a custom index; the name becomes the column label
# once the Series is promoted to a one-column DataFrame.
series = pd.Series(
    data=Salary,
    index=Names,
    name='Salary Scale',
)
convert = series.to_frame()
print(convert)

     Salary Scale
Dan       50000.0
Tee       80000.0
Kay       80000.0


## DataFrame
A DataFrame is a two-dimensional table with labeled rows and columns.

Creating a DataFrame from a dictionary

In [22]:
# DataFrame from a dictionary: each key becomes a column label and each list
# supplies that column's values, row by row.
# NOTE(review): 'Friut' looks like a typo for 'Fruit'; it is left unchanged
# because other cells may refer to the column by this label - confirm before renaming.
data = {
    'Friut': ['Apple', 'Orange', 'Banana'],
    'Colour': ['Green', 'Orange', 'Yellow'],
    'Price': [200, 150, 100],
}

df_dict = pd.DataFrame(data=data)
df_dict

Unnamed: 0,Friut,Colour,Price
0,Apple,Green,200
1,Orange,Orange,150
2,Banana,Yellow,100


In [24]:
# DataFrame from a list of tuples: each tuple is one row, and `columns`
# supplies the column labels in the same order as the tuple fields.
data2 = [('Daniel', 30, 'Nigeria'), ('David', 26, 'Zambia'), ('Sofia', 28, 'Gabon')]

# Only the last expression of a cell is displayed, so the frame is built here
# directly (a bare `data2` line before it would be evaluated and discarded).
pd.DataFrame(data2, columns=['Names', 'Age', 'Country'])

Unnamed: 0,Names,Age,Country
0,Daniel,30,Nigeria
1,David,26,Zambia
2,Sofia,28,Gabon


In [28]:
# DataFrame from parallel lists: the three lists are aligned by position and
# wrapped in a dictionary so that each one becomes a named column.
Places = ['Lagos', 'Paris', 'London', 'Johannesburg']
Language = ['English', 'French', 'English', 'Zulu']
Populations = [20000000, 5000000, 14000000, 8000000]

df2 = pd.DataFrame(
    data={'States': Places, 'Native_language': Language, 'Population': Populations}
)
df2

Unnamed: 0,States,Native_language,Population
0,Lagos,English,20000000
1,Paris,French,5000000
2,London,English,14000000
3,Johannesburg,Zulu,8000000


### Basic Operations on DataFrame and Series

In [32]:
# Preview the first two rows of the frame.
df2.head(n=2)

Unnamed: 0,States,Native_language,Population
0,Lagos,English,20000000
1,Paris,French,5000000


In [34]:
# Preview the last two rows of the frame.
df2.tail(n=2)

Unnamed: 0,States,Native_language,Population
2,London,English,14000000
3,Johannesburg,Zulu,8000000


In [44]:
# Select a single column by its label; the result is a Series named after the column.
df2['Native_language']

0    English
1     French
2    English
3       Zulu
Name: Native_language, dtype: object

In [38]:
# Add a boolean column: True where the population is above six million.
df2['Choice'] = df2['Population'].gt(6000000)

In [40]:
# Display the frame again to show the added 'Choice' column.
df2

Unnamed: 0,States,Native_language,Population,Choice
0,Lagos,English,20000000,True
1,Paris,French,5000000,False
2,London,English,14000000,True
3,Johannesburg,Zulu,8000000,True


In [46]:
# Boolean-mask filtering: keep only the rows whose population exceeds eight million.
# NOTE(review): despite its name, High_Price holds a population filter, not a price filter.
High_Price = df2.loc[df2['Population'].gt(8000000)]
High_Price

Unnamed: 0,States,Native_language,Population,Choice
0,Lagos,English,20000000,True
2,London,English,14000000,True


## Data Importing and Exporting

### Importing of data

In [50]:
# Import a dataset from a local CSV file with read_csv, then preview its first rows.
data_location = pd.read_csv(filepath_or_buffer='datasets/locations.csv')
data_location.head(n=5)

Unnamed: 0,location,continent,population,life_expectancy,hospital_beds_per_thousand,gdp_per_capita
0,Afghanistan,Asia,38928341.0,64.83,0.5,1803.987
1,Albania,Europe,2877800.0,78.57,2.89,11803.431
2,Algeria,Africa,43851043.0,76.88,1.9,13913.839
3,Andorra,Europe,77265.0,83.73,,
4,Angola,Africa,32866268.0,61.15,,5819.495


In [59]:
# Import a dataset straight from a URL: read_csv is given the web address
# in place of a local file path.
url_data = "https://raw.githubusercontent.com/Oyeniran20/Machine-Learning/main/6.%20Trees/housing.csv"

data_housing = pd.read_csv(url_data)

# Only the last expression of a cell is displayed, so preview the first rows
# here rather than evaluating the whole frame on a line of its own.
data_housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [121]:
# Import a JSON (.json) file with read_json.
# This is the raw phone dataset, before any cleaning.
unclean_phones_data = pd.read_json('datasets/phones.json')

# Only the last expression of a cell is displayed, so preview the first rows
# here rather than evaluating the whole frame on a line of its own.
unclean_phones_data.head()

Unnamed: 0,phone_brand,phone_model,price,specs,pricing
0,itel,itel Smart Watch 1,,"{'Network': {'2G bands': ' N/A', '3G bands': '...",
1,oukitel,Oukitel WP19,About 380 EUR,{'Network': {'2G bands': 'GSM 850 / 900 / 1800...,
2,cubot,Cubot Smart Watch,,"{'Network': {'2G bands': ' N/A', '3G bands': '...",
3,cubot,Cubot ID206,,"{'Network': {'2G bands': ' N/A', '3G bands': '...",
4,tcl,TCL Plex,About 330 EUR,{'Network': {'2G bands': 'GSM 850 / 900 / 1800...,


In [65]:
# Load the processed version of the phone dataset from CSV.
clean_phones_data = pd.read_csv('datasets/processed_phones.csv')

# Only the last expression of a cell is displayed, so preview the first rows
# here rather than evaluating the whole frame on a line of its own.
clean_phones_data.head()

Unnamed: 0,phone_brand,phone_model,store,price_usd,storage,ram,launch_date,dimensions,weight,display_type,...,price_range,os_type,os_version,battery_size,colors_available,chip_company,cpu_core,gpu_company,fingerprint,video_resolution
0,apple,Apple iPhone 16 Pro,Amazon DE,1357.55,256,8,2024-09-20,149.6 x 71.5 x 8.3 mm (5.89 x 2.81 x 0.33 in),199.0,"LTPO Super Retina XDR OLED, 120Hz, HDR10, Dolb...",...,medium price,iOS,18.0,Medium,4,Apple,Hexa-core,Apple,Face,4K
1,apple,Apple iPhone 16 Pro,Amazon DE,1492.55,512,8,2024-09-20,149.6 x 71.5 x 8.3 mm (5.89 x 2.81 x 0.33 in),199.0,"LTPO Super Retina XDR OLED, 120Hz, HDR10, Dolb...",...,high price,iOS,18.0,Medium,4,Apple,Hexa-core,Apple,Face,4K
2,apple,Apple iPhone 16 Pro,Amazon DE,1705.32,1000,8,2024-09-20,149.6 x 71.5 x 8.3 mm (5.89 x 2.81 x 0.33 in),199.0,"LTPO Super Retina XDR OLED, 120Hz, HDR10, Dolb...",...,high price,iOS,18.0,Medium,4,Apple,Hexa-core,Apple,Face,4K
3,apple,Apple iPhone 16 Pro Max,Amazon DE,1564.92,512,8,2024-09-20,163 x 77.6 x 8.3 mm (6.42 x 3.06 x 0.33 in),227.0,"LTPO Super Retina XDR OLED, 120Hz, HDR10, Dolb...",...,high price,iOS,18.0,Large,4,Apple,Hexa-core,Apple,Face,4K
4,apple,Apple iPhone 12 mini,Amazon DE,247.32,128,4,2020-11-13,131.5 x 64.2 x 7.4 mm (5.18 x 2.53 x 0.29 in),135.0,"Super Retina XDR OLED, HDR10, Dolby Vision, 62...",...,medium price,iOS,14.1,Small,6,Apple,Hexa-core,Apple,Face,4K


In [69]:
# Importing a dataset from a SQLite database file
# sqlite3 ships with the Python standard library; print the version of the
# SQLite library it is running against.
import sqlite3
print(sqlite3.sqlite_version)

3.45.3


In [94]:
# Connect to the SQLite database file.
db_conn = sqlite3.connect('datasets/wild_fires.sqlite')

try:
    # sqlite_master is SQLite's schema catalogue; restricting it to
    # type='table' lists tables only.
    tables_query = "SELECT name FROM sqlite_master WHERE type='table';"
    tables = pd.read_sql(tables_query, db_conn)

    # Display the table names
    print("Tables in the database:")
    print(tables)
finally:
    # Release the connection even when the query above raises.
    db_conn.close()

Tables in the database:
                                  name
0                      spatial_ref_sys
1                   spatialite_history
2                      sqlite_sequence
3                     geometry_columns
4                  spatial_ref_sys_aux
5               views_geometry_columns
6               virts_geometry_columns
7          geometry_columns_statistics
8    views_geometry_columns_statistics
9    virts_geometry_columns_statistics
10        geometry_columns_field_infos
11  views_geometry_columns_field_infos
12  virts_geometry_columns_field_infos
13               geometry_columns_time
14               geometry_columns_auth
15         views_geometry_columns_auth
16         virts_geometry_columns_auth
17                  sql_statements_log
18                        SpatialIndex
19                ElementaryGeometries
20                                 KNN
21                               Fires
22                     idx_Fires_Shape
23                idx_Fires_Shape_node
2

In [96]:
# Preview the first five rows of every table in the wildfire database.

# Path to the SpatiaLite library.
# NOTE(review): this is an absolute, machine-specific path (it appears to be a
# Homebrew/macOS install location) - adjust it for the machine running the notebook.
spatialite_lib = "/usr/local/Cellar/libspatialite/5.1.0_1/lib/mod_spatialite.dylib"

# Connect to the SQLite database
db_conn = sqlite3.connect('datasets/wild_fires.sqlite')

try:
    # Extension loading is switched on only for the duration of the load and
    # switched off again afterwards; an extension that was loaded stays loaded.
    db_conn.enable_load_extension(True)
    try:
        # Load through the connection API rather than building a SQL string
        # around the path.
        db_conn.load_extension(spatialite_lib)
        print("SpatiaLite extension loaded successfully.")
    except sqlite3.Error as e:
        # Best effort: tables that do not need SpatiaLite can still be previewed.
        print(f"Failed to load SpatiaLite extension: {e}")
    finally:
        db_conn.enable_load_extension(False)

    # Get the list of tables
    tables_query = "SELECT name FROM sqlite_master WHERE type='table';"
    tables = pd.read_sql(tables_query, db_conn)

    # Loop through each table and display its content
    for table_name in tables['name']:
        print(f"\n--- Data from table: {table_name} ---")
        try:
            # Table names cannot be bound as SQL parameters, so quote the
            # identifier (doubling any embedded double quote) before
            # interpolating it into the statement.
            quoted_name = '"' + table_name.replace('"', '""') + '"'

            # Fetch first 5 rows from the table
            table_data = pd.read_sql(f"SELECT * FROM {quoted_name} LIMIT 5;", db_conn)
            print(table_data)
        except Exception as e:
            # Deliberately broad: one unreadable table must not stop the preview
            # of the remaining ones; the error is reported instead of hidden.
            print(f"Could not fetch data from table {table_name}: {e}")
finally:
    # Release the connection even if listing or reading the tables fails.
    db_conn.close()

SpatiaLite extension loaded successfully.

--- Data from table: spatial_ref_sys ---
   srid auth_name  auth_srid                              ref_sys_name  \
0    -1      NONE         -1                     Undefined - Cartesian   
1     0      NONE          0           Undefined - Geographic Long/Lat   
2  2000      epsg       2000  Anguilla 1957 / British West Indies Grid   
3  2001      epsg       2001   Antigua 1943 / British West Indies Grid   
4  2002      epsg       2002  Dominica 1945 / British West Indies Grid   

                                           proj4text  \
0                                                      
1                                                      
2  +proj=tmerc +lat_0=0 +lon_0=-62 +k=0.999500000...   
3  +proj=tmerc +lat_0=0 +lon_0=-62 +k=0.999500000...   
4  +proj=tmerc +lat_0=0 +lon_0=-62 +k=0.999500000...   

                                              srtext  
0                                          Undefined  
1                       

In [98]:
# Import a tab-separated (.tsv) file by telling read_csv to split fields on tabs.
# No column names are given here, so pandas uses the first record as the header row.
movies_data = pd.read_csv('datasets/movie_titles_metadata.tsv', delimiter='\t')
movies_data.head(n=5)

Unnamed: 0,m0,10 things i hate about you,1999,6.90,62847,['comedy' 'romance']
0,m1,1492: conquest of paradise,1992,6.2,10421.0,['adventure' 'biography' 'drama' 'history']
1,m2,15 minutes,2001,6.1,25854.0,['action' 'crime' 'drama' 'thriller']
2,m3,2001: a space odyssey,1968,8.4,163227.0,['adventure' 'mystery' 'sci-fi']
3,m4,48 hrs.,1982,6.9,22289.0,['action' 'comedy' 'crime' 'drama' 'thriller']
4,m5,the fifth element,1997,7.5,133756.0,['action' 'adventure' 'romance' 'sci-fi' 'thri...


In [104]:
# Column names to apply to the file.
col = ['sn', 'name', 'release_year', 'ratings', 'voting', 'genre']

# Passing `names` labels the columns explicitly, so the first record of the
# file is read as data instead of being consumed as the header.
movies_data = pd.read_csv(
    'datasets/movie_titles_metadata.tsv',
    delimiter='\t',
    names=col,
)
movies_data.head(n=5)

Unnamed: 0,sn,name,release_year,ratings,voting,genre
0,m0,10 things i hate about you,1999,6.9,62847.0,['comedy' 'romance']
1,m1,1492: conquest of paradise,1992,6.2,10421.0,['adventure' 'biography' 'drama' 'history']
2,m2,15 minutes,2001,6.1,25854.0,['action' 'crime' 'drama' 'thriller']
3,m3,2001: a space odyssey,1968,8.4,163227.0,['adventure' 'mystery' 'sci-fi']
4,m4,48 hrs.,1982,6.9,22289.0,['action' 'comedy' 'crime' 'drama' 'thriller']


### Functions and Attributes

In [107]:
# Shape of the dataset as a (rows, columns) tuple
movies_data.shape

(617, 6)

In [116]:
# Summary of the dataset: index range, column names, non-null counts,
# dtypes and memory usage
movies_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 617 entries, 0 to 616
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sn            617 non-null    object 
 1   name          616 non-null    object 
 2   release_year  616 non-null    object 
 3   ratings       616 non-null    float64
 4   voting        616 non-null    float64
 5   genre         616 non-null    object 
dtypes: float64(2), object(4)
memory usage: 29.1+ KB


### Exporting of data

In [123]:
# Export the DataFrame that was loaded from the .json file as a CSV file.
# index=False leaves out the automatic row index, matching the Excel export
# in this notebook; otherwise it is written as an extra unnamed first column.
unclean_phones_data.to_csv('unclean_phones_data.csv', index=False)

In [125]:
# Converting sqlite to xlsx: every table in the database is written to its
# own Excel workbook named after the table.

# Connect to the SQLite database
db_conn = sqlite3.connect('datasets/wild_fires.sqlite')

try:
    # Load the SpatiaLite extension. Extension loading is enabled only for the
    # load itself and disabled again afterwards; the loaded extension stays active.
    # NOTE(review): the library path is absolute and machine-specific - adjust as needed.
    db_conn.enable_load_extension(True)
    try:
        db_conn.load_extension('/usr/local/Cellar/libspatialite/5.1.0_1/lib/mod_spatialite.dylib')  # Path to mod_spatialite.dylib
    finally:
        db_conn.enable_load_extension(False)

    # Get the list of tables
    tables_query = "SELECT name FROM sqlite_master WHERE type='table';"
    tables = pd.read_sql(tables_query, db_conn)

    # Loop through each table and export to Excel
    for table_name in tables['name']:
        print(f"Exporting data from table: {table_name}...")

        # Table names cannot be bound as SQL parameters, so quote the
        # identifier (doubling any embedded double quote) before interpolating it.
        quoted_name = '"' + table_name.replace('"', '""') + '"'

        try:
            # Fetch all data from the table
            table_data = pd.read_sql(f"SELECT * FROM {quoted_name};", db_conn)
        except Exception as e:
            # Deliberately broad: an unreadable table is reported and skipped
            # so the remaining tables are still exported.
            print(f"Could not fetch data from table {table_name}: {e}")
            continue

        # Define Excel filename (you can customize this)
        excel_filename = f"{table_name}.xlsx"

        try:
            # Export to Excel
            table_data.to_excel(excel_filename, index=False)
            print(f"Data exported to {excel_filename}")
        except Exception as e:
            # Reported separately from fetch failures so the message names the
            # step that actually failed.
            print(f"Could not export table {table_name} to {excel_filename}: {e}")
finally:
    # Close the database connection, even if loading or exporting fails
    db_conn.close()

Exporting data from table: spatial_ref_sys...
Data exported to spatial_ref_sys.xlsx
Exporting data from table: spatialite_history...
Data exported to spatialite_history.xlsx
Exporting data from table: sqlite_sequence...
Data exported to sqlite_sequence.xlsx
Exporting data from table: geometry_columns...
Data exported to geometry_columns.xlsx
Exporting data from table: spatial_ref_sys_aux...
Data exported to spatial_ref_sys_aux.xlsx
Exporting data from table: views_geometry_columns...
Data exported to views_geometry_columns.xlsx
Exporting data from table: virts_geometry_columns...
Data exported to virts_geometry_columns.xlsx
Exporting data from table: geometry_columns_statistics...
Data exported to geometry_columns_statistics.xlsx
Exporting data from table: views_geometry_columns_statistics...
Data exported to views_geometry_columns_statistics.xlsx
Exporting data from table: virts_geometry_columns_statistics...
Data exported to virts_geometry_columns_statistics.xlsx
Exporting data from 