# Data Exploration

## Basic setup

In [23]:
import os
import pandas as pd

# Single source of truth for the database location. The connection URL below
# is derived from it so the path and the URL can never drift apart.
REL_DB_LOC = './clean/evs_per_capita.sqlite'
# SQLAlchemy SQLite URL: three slashes followed by a path relative to the
# current working directory.
DB_CON = f'sqlite:///{REL_DB_LOC}'

# Require a regular file: a directory at this path would satisfy a plain
# existence check and only fail later, when the database is opened.
data_found = os.path.isfile(REL_DB_LOC)

# Stop early with an actionable message instead of a cryptic SQL error in a
# later cell.
if not data_found:
    raise SystemExit("No data found! Please execute 'data_pipeline.py' first!")


## Import the data

In [24]:
# Load the 'evs_per_capita' table into a DataFrame via the DB_CON connection URL.
df = pd.read_sql(sql='evs_per_capita', con=DB_CON)

## Data description

| **Column**     | **Description**                                                 |
|----------------|-----------------------------------------------------------------|
| federal_state  | Name of the federal state                                       |
| electric_total | Total no. of (purely) electric vehicle registrations            |
| hybrid_total   | Total no. of hybrid vehicle registrations                       |
| total          | Total no. of vehicle registrations                              |
| share_electric | Fraction (0-1) of registrations that are electric or hybrid     |
| gdp_per_capita | GDP per capita                                                  |

## Data statistics

In [25]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,electric_total,hybrid_total,total,share_electric,gdp_per_capita
count,16.0,16.0,16.0,16.0,16.0
mean,1857.375,3713.8125,12676.8125,0.435513,44579.8125
std,2117.656027,4314.735654,14135.044427,0.057254,11537.411849
min,191.0,511.0,1349.0,0.3502,32837.0
25%,350.0,805.5,3143.75,0.391125,35584.25
50%,781.5,1891.0,6079.5,0.43975,41596.0
75%,3042.75,4616.5,20811.75,0.467,50808.75
max,6228.0,14286.0,43821.0,0.5324,76910.0


## Data types

In [26]:
# Print a concise overview: column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16 entries, 0 to 15
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   federal_state   16 non-null     object 
 1   electric_total  16 non-null     int64  
 2   hybrid_total    16 non-null     int64  
 3   total           16 non-null     int64  
 4   share_electric  16 non-null     float64
 5   gdp_per_capita  16 non-null     int64  
dtypes: float64(1), int64(4), object(1)
memory usage: 900.0+ bytes


## Example data

In [27]:
# Show the first five rows as a quick sanity check of the loaded data.
df.head()

Unnamed: 0,federal_state,electric_total,hybrid_total,total,share_electric,gdp_per_capita
0,Baden-Württemberg,4917,9795,29256,0.5029,50982
1,Bayern,5935,14286,43821,0.4614,53768
2,Berlin,903,2487,6367,0.5324,48147
3,Brandenburg,529,938,3349,0.438,34610
4,Bremen,191,512,1349,0.5211,56901
