# Data Exploration

## Basic setup

In [1]:
import os
import pandas as pd

# Location of the SQLite database file, relative to the current working directory.
REL_DB_LOC = './clean/evs_per_capita.sqlite'
# Connection URL derived from REL_DB_LOC so that the existence check below and
# the database read that uses DB_CON always refer to the same file.
DB_CON = f'sqlite:///{REL_DB_LOC}'

# isfile() instead of exists(): a directory at this path is not a usable database.
data_found = os.path.isfile(REL_DB_LOC)

if not data_found:
    # Stop right away with an actionable message rather than failing later on the read.
    raise SystemExit("No data found! Please execute 'data_pipeline.py' first!")


## Import the data

In [2]:
# Read the 'evs_per_capita' table into a DataFrame via the DB_CON connection URL.
df = pd.read_sql(sql='evs_per_capita', con=DB_CON)

## Data description

| **Column**     | **Description**                                                 |
|----------------|-----------------------------------------------------------------|
| federal_state  | Name of the federal state                                       |
| electric_total | Total no. of (purely) electric vehicle registrations            |
| hybrid_total   | Total no. of hybrid vehicle registrations                       |
| total          | Total no. of vehicle registrations                              |
| share_electric | Fraction (0-1) of electric (incl. hybrid) vehicle registrations |
| gdp_per_capita | GDP per capita                                                  |

## Data statistics

In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,electric_total,total,hybrid_total,share_electric,gdp_per_capita
count,16.0,16.0,16.0,16.0,16.0
mean,7774.5625,54329.1875,15815.125,0.430856,44579.8125
std,8957.123586,60380.779832,17993.1496,0.050083,11537.411849
min,770.0,5809.0,2243.0,0.3417,32837.0
25%,1506.0,14496.5,3846.75,0.39195,35584.25
50%,3089.5,26517.0,8280.5,0.4375,41596.0
75%,11954.5,86040.25,19812.5,0.450975,50808.75
max,27086.0,185102.0,55845.0,0.5187,76910.0


## Data types

In [4]:
# Print column names, non-null counts, dtypes and the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16 entries, 0 to 15
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   federal_state   16 non-null     object 
 1   electric_total  16 non-null     int64  
 2   total           16 non-null     int64  
 3   hybrid_total    16 non-null     int64  
 4   share_electric  16 non-null     float64
 5   gdp_per_capita  16 non-null     int64  
dtypes: float64(1), int64(4), object(1)
memory usage: 900.0+ bytes


## Example data

In [5]:
# Preview the first five rows of the table.
df.head()

Unnamed: 0,federal_state,electric_total,total,hybrid_total,share_electric,gdp_per_capita
0,Baden-Württemberg,20872,127339,42351,0.4965,50982
1,Bayern,24662,185102,55845,0.4349,53768
2,Berlin,3323,25716,9598,0.5024,48147
3,Brandenburg,2185,15507,4498,0.431,34610
4,Bremen,770,5809,2243,0.5187,56901
