In [10]:
!pip install -U gdown



In [11]:
import gdown

# ~~~ FILE LOCATIONS ~~~

file_id = '1AuMqjccgg354xLRmayryTjnIBTYHMDjV'             # 01_road_accidents_france_2019_2023_cleaning.csv
url = f'https://drive.google.com/uc?id={file_id}'

# ~~~ DOWNLOAD FILE ~~~

gdown.download(url, '01_road_accidents_france_2019_2023_cleaning.csv', quiet=False)

# ~~~ READ CVS FILE ~~~

import pandas as pd

# Full data
data = pd.read_csv('01_road_accidents_france_2019_2023_cleaning.csv', sep=';',
                   dtype={"num_traffic_lane": "string",
                          "age": "Int64"
                          },
                   parse_dates=["date", "datetime"])

# Reconvert "age_bucket" and "weekday" to dtype category
data["age_bucket"] = data["age_bucket"].astype("category")
data["weekday"] = data["weekday"].astype("category")
data['num_vehicle'] = data['num_vehicle'].astype("category")

# Replace NaA & convert "year_of_birth" to int64
data["year_of_birth"] = data["year_of_birth"].fillna(-1).astype('int64')

Downloading...
From (original): https://drive.google.com/uc?id=1AuMqjccgg354xLRmayryTjnIBTYHMDjV
From (redirected): https://drive.google.com/uc?id=1AuMqjccgg354xLRmayryTjnIBTYHMDjV&confirm=t&uuid=73a21cb3-551e-4aa5-ac7c-51d36d11b01d
To: /content/01_road_accidents_france_2019_2023_cleaning.csv
100%|██████████| 122M/122M [00:01<00:00, 82.0MB/s]
  data = pd.read_csv('01_road_accidents_france_2019_2023_cleaning.csv', sep=';',


This collab was used to prepare our code and dataset for integration into Power BI by restructuring it into a fact table (fact_user) and several dimension tables (dim_age, dim_department, dim_speedband, dim_severity). Before exporting, we cleaned and modified the data in Python to simplify the modeling process in Power BI (as the file would get too heavy if too many steps were required).

We began by creating a copy of the original dataset and isolating the columns needed for our fact table, renaming and replacing numeric codes (e.g., gender, user type, vehicle category, and road type) with descriptive labels to make the dataeasier to understand. Additionaly we created new fields, such as a has_helmet column derived from safety equipment data.

Next, we constructed individual dimension tables for age, department, speed band, and severity. Each dimension was cleaned and ordered (using sort_order columns) to ensure correct sorting in Power BI visuals. This preprocessing step ensured that the relationships between tables were clear, the dataset remained light in file size, and categorical values were understandable.

Overall, this collab reflects our decision to define the data model structure and labeling in Python first, enabling a more efficient and consistent data import process into Power BI.

In [12]:
# Copy of data just in case
data_copy = data.copy()

data_copy.info()

# FACT TABLE
fact_user = data_copy[[
    "num_accident",
    "date",
    "hour",
    "age",
    "category_user",
    "gender",
    "category_vehicle",
    "dep",
    "speed_limit",
    "category_road",
    "injury_severity",
    "safety_equipment1",
    "safety_equipment2",
    "safety_equipment3"
]].copy()

# Rename column date to date_key
fact_user.rename(columns={"date": "date_key"}, inplace=True)

# Modify column gender so that 1 is Male, 2 is Female, and -1 is Not specified
gender_labels = {   # Create a dictionary that assigns numbers -1, 1, and 2 to our mentioned labels
    1: "Male",
    2: "Female",
   -1: "Not specified"
}

fact_user["gender"] = fact_user["gender"].map(gender_labels) # Modify the column and tell it to base the value it puts with what it sees in our dictionary

# Modify column category_user so that 1 is Driver, 2 is Passenger, and 3 is Pedestrian
user_labels = {   # Create a dictionary that assigns numbers 1, 2, and 3 to our mentioned labels
    1: "Driver",
    2: "Passenger",
    3: "Pedestrian"
}

fact_user["category_user"] = fact_user["category_user"].map(user_labels) # Modify the column and tell it to base the value it puts with what it sees in our dictionary

# Modify column category_vehicle so that 1 is Bike, 80 is e-bike, and all other numbers (that we will first turn to NAs) will turn to Other vehicle
vehicle_map = {   # Create a dictionary that assigns numbers 1 and 80 to our mentioned labels
    1: "Bike",
    80: "e-bike"
}

fact_user["category_vehicle"] = fact_user["category_vehicle"].map(vehicle_map).fillna("Other vehicle") #  Modify the column and tell it to base the value it puts with what it sees in our dictionary and to replace NAs with "Other vehicle"

# Create has_helmet column using safety_equipment columns. If any has the number 2 return "Helmet" if not then "No helmet"
fact_user["has_helmet"] = fact_user[
    ["safety_equipment1", "safety_equipment2", "safety_equipment3"]
].apply(lambda row: "Helmet" if 2 in row.values else "No helmet", axis=1)

# Drop safety_equipment columns
fact_user = fact_user.drop(columns=["safety_equipment1", "safety_equipment2", "safety_equipment3"])

# For column dep replace all the "1, 2, 3" with "01, 02, 03"
fact_user["dep"] = fact_user["dep"].replace({"1": "01",
                                             "2": "02",
                                             "3": "03",
                                             "4": "04",
                                             "5": "05",
                                             "6": "06",
                                             "7": "07",
                                             "8": "08",
                                             "9": "09"})

# For column category_road change the numbers 5 and 6 by 9 (others)
fact_user["category_road"] = fact_user["category_road"].replace({5: 9, 6: 9})

# Modify column category_road so that 1 is Highway, 2 is National Road, and etc....
road_labels = {   # Create a dictionary that assigns numbers to our mentioned labels
    1: "Highway",
    2: "National Road",
    3: "Departmental Road",
    4: "Communal Way",
    7: "Urban Metropolitan Road",
    9: "Other"
}

fact_user["category_road"] = fact_user["category_road"].map(road_labels)

# DIMENSION TABLES

# *** Dim_Date Table ***
# WILL BE CREATED IN POWERBI

# *** Dim_Age Table ***
dim_age = data_copy[[
    "age",
    "age_bucket"
]].drop_duplicates().copy()

# Rename age_bucket column to age_bucket_label
dim_age.rename(columns={"age_bucket": "age_bucket_label"}, inplace=True)

# Since we have some ages missing we define full range of ages (including -1 for not specified)
full_ages = pd.DataFrame({"age": list(range(-1, 121))})

# Merge with dim_age (keeping all full_ages)
dim_age = full_ages.merge(dim_age, on="age", how="left")

# Convert to string first to avoid Categorical assignment error
dim_age["age_bucket_label"] = dim_age["age_bucket_label"].astype(str)

# As our column age_bucket was made before we added these new values in ages they for now appear in other columns as NA. Therefore we fix and fill the missing based on age
dim_age.loc[dim_age["age"] == -1, "age_bucket_label"] = "Not specified"
dim_age.loc[(dim_age["age"] >= 0) & (dim_age["age"] <= 13), "age_bucket_label"] = "0–13"
dim_age.loc[(dim_age["age"] >= 14) & (dim_age["age"] <= 17), "age_bucket_label"] = "14–17"
dim_age.loc[(dim_age["age"] >= 18) & (dim_age["age"] <= 24), "age_bucket_label"] = "18–24"
dim_age.loc[(dim_age["age"] >= 25) & (dim_age["age"] <= 34), "age_bucket_label"] = "25–34"
dim_age.loc[(dim_age["age"] >= 35) & (dim_age["age"] <= 44), "age_bucket_label"] = "35–44"
dim_age.loc[(dim_age["age"] >= 45) & (dim_age["age"] <= 54), "age_bucket_label"] = "45–54"
dim_age.loc[(dim_age["age"] >= 55) & (dim_age["age"] <= 64), "age_bucket_label"] = "55–64"
dim_age.loc[(dim_age["age"] >= 65) & (dim_age["age"] <= 74), "age_bucket_label"] = "65–74"
dim_age.loc[(dim_age["age"] >= 75) & (dim_age["age"] <= 84), "age_bucket_label"] = "75–84"
dim_age.loc[dim_age["age"] >= 85, "age_bucket_label"] = "85 and older"

# Create column sort_order to keep our age_bucket in order from smallest to largest in PowerBI
sort_order_map = {    # Create a dictionary that assigns numbers 1 to 10 to our speed band/range
    "Not specified": -1,
    "0–13": 1,
    "14–17": 2,
    "18–24": 3,
    "25–34": 4,
    "35–44": 5,
    "45–54": 6,
    "55–64": 7,
    "65–74": 8,
    "75–84": 9,
    "85 and older": 10
}

dim_age["sort_order"] = dim_age["age_bucket_label"].map(sort_order_map) # Create the column and tell it to base the value it puts with what it sees in column age_bucket (i.e. 1 for 0-13, 2 for 14-17, etc)

# Drop duplicates as it is a dimensions table
dim_age = dim_age.drop_duplicates().reset_index(drop=True)


# *** Dim_Department Table ***
dim_dep = fact_user[[
    "dep"
]].copy()

# Rename "dep" column to "dep_code"
dim_dep.rename(columns={"dep": "dep_code"}, inplace=True)

# Create dictionary associating our department codes to department names
dep_names = {
    "01": "Ain", "02": "Aisne", "03": "Allier", "04": "Alpes-de-Haute-Provence",
    "05": "Hautes-Alpes", "06": "Alpes-Maritimes", "07": "Ardèche", "08": "Ardennes", "09": "Ariège",
    "10": "Aube", "11": "Aude", "12": "Aveyron", "13": "Bouches-du-Rhône", "14": "Calvados",
    "15": "Cantal", "16": "Charente", "17": "Charente-Maritime", "18": "Cher", "19": "Corrèze",
    "2A": "Corse-du-Sud", "2B": "Haute-Corse", "21": "Côte-d'Or", "22": "Côtes-d'Armor",
    "23": "Creuse", "24": "Dordogne", "25": "Doubs", "26": "Drôme", "27": "Eure",
    "28": "Eure-et-Loir", "29": "Finistère", "30": "Gard", "31": "Haute-Garonne",
    "32": "Gers", "33": "Gironde", "34": "Hérault", "35": "Ille-et-Vilaine",
    "36": "Indre", "37": "Indre-et-Loire", "38": "Isère", "39": "Jura", "40": "Landes",
    "41": "Loir-et-Cher", "42": "Loire", "43": "Haute-Loire", "44": "Loire-Atlantique",
    "45": "Loiret", "46": "Lot", "47": "Lot-et-Garonne", "48": "Lozère",
    "49": "Maine-et-Loire", "50": "Manche", "51": "Marne", "52": "Haute-Marne",
    "53": "Mayenne", "54": "Meurthe-et-Moselle", "55": "Meuse", "56": "Morbihan",
    "57": "Moselle", "58": "Nièvre", "59": "Nord", "60": "Oise", "61": "Orne",
    "62": "Pas-de-Calais", "63": "Puy-de-Dôme", "64": "Pyrénées-Atlantiques",
    "65": "Hautes-Pyrénées", "66": "Pyrénées-Orientales", "67": "Bas-Rhin",
    "68": "Haut-Rhin", "69": "Rhône", "70": "Haute-Saône", "71": "Saône-et-Loire",
    "72": "Sarthe", "73": "Savoie", "74": "Haute-Savoie", "75": "Paris",
    "76": "Seine-Maritime", "77": "Seine-et-Marne", "78": "Yvelines",
    "79": "Deux-Sèvres", "80": "Somme", "81": "Tarn", "82": "Tarn-et-Garonne",
    "83": "Var", "84": "Vaucluse", "85": "Vendée", "86": "Vienne", "87": "Haute-Vienne",
    "88": "Vosges", "89": "Yonne", "90": "Territoire de Belfort", "91": "Essonne",
    "92": "Hauts-de-Seine", "93": "Seine-Saint-Denis", "94": "Val-de-Marne",
    "95": "Val-d'Oise", "971": "Guadeloupe", "972": "Martinique", "973": "Guyane", "974": "La Réunion",
    "975": "Saint-Pierre-et-Miquelon", "976": "Mayotte","977": "Saint-Barthélemy", "978": "Saint-Martin", "986": "Wallis-et-Futuna",
    "987": "Polynésie française", "988": "Nouvelle-Calédonie"
}

# Create column dep_names that will look at column dep_code and return the department name
dim_dep["dep_name"] = dim_dep["dep_code"].astype(str).map(dep_names)

# Create column dep_name_code that takes our dep_name + dep_code in one column. Format:"Name (Code)"
dim_dep["dep_name_code"] = dim_dep["dep_name"] + " (" + dim_dep["dep_code"].astype(str) + ")"

# Drop duplicates as it is a dimensions table
dim_dep = dim_dep.drop_duplicates().reset_index(drop=True)



# *** Dim_SpeedBand Table ***
dim_speedband = data_copy[[
    "speed_limit",
]].copy()

# We want a new column called speed_band that will represent our speed using the same ranges used in our graphs (i.e. 0-30, 31-, etc) First create our bins and labels then use pd.cut to assign each speed_limit to a band/range
bins = [-2, 30, 50, 70, 90, 110, float("inf")]
labels = ["0–30", "31–50", "51–70", "71–90", "91–110", "110+"]

dim_speedband["speed_band"] = pd.cut(dim_speedband["speed_limit"], # Create new column and assign each speed_limit to a band/range
                                     bins=bins,
                                     labels=labels,
                                     right=True,
                                     include_lowest=True)

# Convert to string so we can add "Not specified"
dim_speedband["speed_band"] = dim_speedband["speed_band"].astype(str)

# If we have a -1 then specifically in our speed_band column refer it to "not specified"
dim_speedband.loc[dim_speedband["speed_limit"] == -1, "speed_band"] = "Not specified"

# Create column sort_order to keep our speed_band in order from smallest to largest in PowerBI
sort_order_map = {    # Create a dictionary that assigns numbers 1 to 6 to our speed band/range
    "0–30": 1,
    "31–50": 2,
    "51–70": 3,
    "71–90": 4,
    "91–110": 5,
    "110+": 6,
    "Not specified": 7
}

dim_speedband["sort_order"] = dim_speedband["speed_band"].map(sort_order_map) # Create the column and tell it to base the value it puts with what it sees in column speed_band (i.e. 1 for 0-30, 2 for 31-50, etc)

# Drop duplicates as it is a dimensions table
dim_speedband = dim_speedband.drop_duplicates().reset_index(drop=True)


# *** Dim_Severity Table ***
dim_severity = data_copy[[
    "injury_severity",
]].copy()

# We want to create a column called severity_name that will represent our injury severity with "Not specified", "Unscathed" "Hospitalized Wounded", and "Killed", "Light injured".
severity_name = {
    -1: "Not specified",  # Create a dictionary that assigns -1, 1, 2, 3, 4 to our injury labels
    1: "Unscathed",
    2: "Killed",
    3: "Hospitalized wounded",
    4: "Light injury"
}

# Create column injury_severity_name that will look at column injury_severity and return the label name set
dim_severity["injury_severity_name"] = dim_severity["injury_severity"].map(severity_name)

# We want to create a column called is_severe that will say "severe" if injury_severity is 2 or 3 and "non_severe" for the others
dim_severity["is_severe"] = dim_severity["injury_severity"].apply(lambda x: "Severe" if x in [2, 3] else "Non-severe")

# Create column severity_name_order to keep our severity_labels in order from smallest to largest in PowerBI
severity_name_order = {
    "Not specified": 1,
    "Unscathed": 2,
    "Killed": 5,
    "Hospitalized wounded": 4,
    "Light injury": 3
}

dim_severity["severity_name_order"] = dim_severity["injury_severity_name"].map(severity_name_order) # Create the column and tell it to base the value it puts with what it sees in column severity_label (i.e. 1 for Non-Severe, 2 for Hospitalized wounded, etc)

# We want to create a column called severity_label that will represent our injury severity with "Non-Sever", "Hospitalized Wounded", and "Killed".
severity_label = {
    -1: "Non-severe",  # Create a dictionary that assigns -1, 1, 2, 3, 4 to our severity labels
    1: "Non-severe",
    2: "Killed",
    3: "Hospitalized wounded",
    4: "Non-severe"
}

# Create column injury_severity_name that will look at column injury_severity and return the label name set
dim_severity["severity_label"] = dim_severity["injury_severity"].map(severity_label)

# Create column severity_label_order to keep our severity_labels in order from smallest to largest in PowerBI
severity_label_order = {
    "Non-severe": 1,
    "Killed": 2,
    "Hospitalized wounded": 3
}

dim_severity["severity_label_order"] = dim_severity["severity_label"].map(severity_label_order)

dim_severity = dim_severity.drop_duplicates().reset_index(drop=True)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 619807 entries, 0 to 619806
Data columns (total 46 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   num_accident       619807 non-null  int64         
 1   id_vehicle         619807 non-null  int64         
 2   num_vehicle        619807 non-null  category      
 3   category_user      619807 non-null  int64         
 4   injury_severity    619807 non-null  int64         
 5   gender             619807 non-null  int64         
 6   year_of_birth      619807 non-null  int64         
 7   trip_purpose       619807 non-null  int64         
 8   safety_equipment1  619807 non-null  int64         
 9   safety_equipment2  619807 non-null  int64         
 10  safety_equipment3  619807 non-null  int64         
 11  category_vehicle   619807 non-null  int64         
 12  fixed_obs_hit      619807 non-null  int64         
 13  mobile_obs_hit     619807 non-null  int64   

In [13]:
display(fact_user.head(15))
print(fact_user.info())

# Verify that the numbers 5 and 6 no longer appear
data_copy["category_road"].value_counts().sort_index() # 5 and 6 are present
fact_user["category_road"].value_counts().sort_index() # 5 and 6 are no longer present, total of 9 has 5 and 6

# Verify gender, category vehicle, has_helmet.
display(fact_user["gender"].value_counts())
display(fact_user["category_vehicle"].value_counts())
display(fact_user["has_helmet"].value_counts())


Unnamed: 0,num_accident,date_key,hour,age,category_user,gender,category_vehicle,dep,speed_limit,category_road,injury_severity,has_helmet
0,201900000001,2019-11-30,1,17,Passenger,Female,Other vehicle,93,70,Highway,4,No helmet
1,201900000001,2019-11-30,1,26,Driver,Female,Other vehicle,93,70,Highway,4,No helmet
2,201900000001,2019-11-30,1,60,Driver,Male,Other vehicle,93,70,Highway,1,No helmet
3,201900000002,2019-11-30,2,25,Driver,Female,Other vehicle,93,70,Highway,4,No helmet
4,201900000003,2019-11-28,15,23,Driver,Male,Other vehicle,92,90,Highway,1,No helmet
5,201900000003,2019-11-28,15,89,Passenger,Female,Other vehicle,92,90,Highway,4,No helmet
6,201900000003,2019-11-28,15,24,Driver,Male,Other vehicle,92,90,Highway,4,No helmet
7,201900000003,2019-11-28,15,53,Driver,Male,Other vehicle,92,90,Highway,1,No helmet
8,201900000004,2019-11-30,20,26,Driver,Male,Other vehicle,94,90,Highway,1,No helmet
9,201900000004,2019-11-30,20,51,Driver,Male,Other vehicle,94,90,Highway,1,No helmet


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 619807 entries, 0 to 619806
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   num_accident      619807 non-null  int64         
 1   date_key          619807 non-null  datetime64[ns]
 2   hour              619807 non-null  int64         
 3   age               611268 non-null  Int64         
 4   category_user     619807 non-null  object        
 5   gender            619807 non-null  object        
 6   category_vehicle  619807 non-null  object        
 7   dep               619807 non-null  object        
 8   speed_limit       619807 non-null  int64         
 9   category_road     619807 non-null  object        
 10  injury_severity   619807 non-null  int64         
 11  has_helmet        619807 non-null  object        
dtypes: Int64(1), datetime64[ns](1), int64(4), object(6)
memory usage: 57.3+ MB
None


Unnamed: 0_level_0,count
gender,Unnamed: 1_level_1
Male,417805
Female,193765
Not specified,8237


Unnamed: 0_level_0,count
category_vehicle,Unnamed: 1_level_1
Other vehicle,589388
Bike,27619
e-bike,2800


Unnamed: 0_level_0,count
has_helmet,Unnamed: 1_level_1
No helmet,505602
Helmet,114205


In [14]:
# Sort by age in ascending order and display
dim_age = dim_age.sort_values(by="age", ascending=True).reset_index(drop=True)
dim_age["age"].unique()
display(dim_age.head(35))
print(dim_age.info())


Unnamed: 0,age,age_bucket_label,sort_order
0,-1,Not specified,-1
1,0,0–13,1
2,1,0–13,1
3,2,0–13,1
4,3,0–13,1
5,4,0–13,1
6,5,0–13,1
7,6,0–13,1
8,7,0–13,1
9,8,0–13,1


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122 entries, 0 to 121
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   age               122 non-null    int64 
 1   age_bucket_label  122 non-null    object
 2   sort_order        122 non-null    int64 
dtypes: int64(2), object(1)
memory usage: 3.0+ KB
None


In [15]:
dim_dep = dim_dep.sort_values(by="dep_code", ascending=True).reset_index(drop=True)
display(dim_dep.head(25))
print(dim_dep.info())
print(dim_dep['dep_code'].unique())


# dep_name should have 9 less rows than data_copy "dep" since first 9 departments are represented as 01 or 1
data_copy["dep"].value_counts().sort_index() # 107 Rows
dim_dep["dep_name"].value_counts().sort_index() # 107 Rows

Unnamed: 0,dep_code,dep_name,dep_name_code
0,1,Ain,Ain (01)
1,2,Aisne,Aisne (02)
2,3,Allier,Allier (03)
3,4,Alpes-de-Haute-Provence,Alpes-de-Haute-Provence (04)
4,5,Hautes-Alpes,Hautes-Alpes (05)
5,6,Alpes-Maritimes,Alpes-Maritimes (06)
6,7,Ardèche,Ardèche (07)
7,8,Ardennes,Ardennes (08)
8,9,Ariège,Ariège (09)
9,10,Aube,Aube (10)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 107 entries, 0 to 106
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   dep_code       107 non-null    object
 1   dep_name       107 non-null    object
 2   dep_name_code  107 non-null    object
dtypes: object(3)
memory usage: 2.6+ KB
None
['01' '02' '03' '04' '05' '06' '07' '08' '09' '10' '11' '12' '13' '14'
 '15' '16' '17' '18' '19' '21' '22' '23' '24' '25' '26' '27' '28' '29'
 '2A' '2B' '30' '31' '32' '33' '34' '35' '36' '37' '38' '39' '40' '41'
 '42' '43' '44' '45' '46' '47' '48' '49' '50' '51' '52' '53' '54' '55'
 '56' '57' '58' '59' '60' '61' '62' '63' '64' '65' '66' '67' '68' '69'
 '70' '71' '72' '73' '74' '75' '76' '77' '78' '79' '80' '81' '82' '83'
 '84' '85' '86' '87' '88' '89' '90' '91' '92' '93' '94' '95' '971' '972'
 '973' '974' '975' '976' '977' '978' '986' '987' '988']


Unnamed: 0_level_0,count
dep_name,Unnamed: 1_level_1
Ain,1
Aisne,1
Allier,1
Alpes-Maritimes,1
Alpes-de-Haute-Provence,1
...,...
Vienne,1
Vosges,1
Wallis-et-Futuna,1
Yonne,1


In [16]:
dim_speedband = dim_speedband.sort_values(by="speed_limit", ascending=True).reset_index(drop=True)
display(dim_speedband.head(15))
print(dim_speedband.info())

display(data_copy['speed_limit'].value_counts())

Unnamed: 0,speed_limit,speed_band,sort_order
0,-1,Not specified,7
1,0,0–30,1
2,1,0–30,1
3,2,0–30,1
4,3,0–30,1
5,4,0–30,1
6,5,0–30,1
7,6,0–30,1
8,7,0–30,1
9,8,0–30,1


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56 entries, 0 to 55
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   speed_limit  56 non-null     int64 
 1   speed_band   56 non-null     object
 2   sort_order   56 non-null     int64 
dtypes: int64(2), object(1)
memory usage: 1.4+ KB
None


Unnamed: 0_level_0,count
speed_limit,Unnamed: 1_level_1
50,307280
80,86317
30,67197
90,49883
70,47144
110,25921
130,16544
-1,9258
40,2678
60,2238


In [17]:
dim_severity = dim_severity.sort_values(by="injury_severity", ascending=True).reset_index(drop=True)

display(dim_severity.head(15))
print(dim_severity.info())

Unnamed: 0,injury_severity,injury_severity_name,is_severe,severity_name_order,severity_label,severity_label_order
0,-1,Not specified,Non-severe,1,Non-severe,1
1,1,Unscathed,Non-severe,2,Non-severe,1
2,2,Killed,Severe,5,Killed,2
3,3,Hospitalized wounded,Severe,4,Hospitalized wounded,3
4,4,Light injury,Non-severe,3,Non-severe,1


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   injury_severity       5 non-null      int64 
 1   injury_severity_name  5 non-null      object
 2   is_severe             5 non-null      object
 3   severity_name_order   5 non-null      int64 
 4   severity_label        5 non-null      object
 5   severity_label_order  5 non-null      int64 
dtypes: int64(3), object(3)
memory usage: 372.0+ bytes
None


In [18]:
fact_user.to_csv("Fact_User.csv", index=False)
dim_age.to_csv("Dim_Age.csv", index=False)
dim_dep.to_csv("Dim_Department.csv", index=False)
dim_speedband.to_csv("Dim_SpeedBand.csv", index=False)
dim_severity.to_csv("Dim_Severity.csv", index=False)