In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pickle
from fastparquet import ParquetFile
from fastparquet import write
import pyarrow as pa




### Use the trained model with “real” data

In [3]:
# Read the raw troop-movement CSV into memory and display the resulting frame.
csv_path = 'troop_movements10m.csv'
df = pd.read_csv(csv_path)
df

Unnamed: 0,timestamp,unit_id,unit_type,location_x,location_y,destination_x,destination_y,homeworld
0,2023-06-13 17:33:18,1,at-st,2.0,8.0,1,1,Glee Anselm
1,2023-06-13 17:33:17,2,tie_silencer,4.0,4.0,0,1,Trandosha
2,2023-06-13 17:33:16,3,at-at,0.0,3.0,6,1,Corellia
3,2023-06-13 17:33:15,4,tie_silencer,6.0,1.0,6,9,Shili
4,2023-06-13 17:33:14,5,tie_fighter,0.0,4.0,9,6,Muunilinst
...,...,...,...,...,...,...,...,...
9999995,2023-02-18 10:26:20,9999996,resistance_soldier,6.0,6.0,1,3,Troiken
9999996,2023-02-18 10:26:19,9999997,tie_silencer,8.0,6.0,2,0,Kashyyyk
9999997,2023-02-18 10:26:18,9999998,tie_fighter,7.0,7.0,6,4,Kashyyyk
9999998,2023-02-18 10:26:17,9999999,tie_fighter,6.0,6.0,8,8,Kalee


### Some unit_type records have a value of invalid_unit. Replace that with unknown

In [9]:
# Tally how many rows carry each unit_type label in the raw frame.
unit_type = df.loc[:, 'unit_type'].value_counts()
unit_type

x-wing                1428412
stormtrooper          1428291
tie_silencer          1428016
tie_fighter           1426809
at-at                 1426525
at-st                 1426173
resistance_soldier    1425774
invalid_unit            10000
Name: unit_type, dtype: int64

In [8]:
# Swap the placeholder label 'invalid_unit' for 'unknown' in unit_type only.
# The nested-dict form limits the substitution to that one column, so the
# remaining columns of the frame are neither scanned nor altered.
# replace() returns a new frame; `df` itself is left untouched.
inplace = df.replace({'unit_type': {'invalid_unit': 'unknown'}})
inplace

Unnamed: 0,timestamp,unit_id,unit_type,location_x,location_y,destination_x,destination_y,homeworld
0,2023-06-13 17:33:18,1,at-st,2.0,8.0,1,1,Glee Anselm
1,2023-06-13 17:33:17,2,tie_silencer,4.0,4.0,0,1,Trandosha
2,2023-06-13 17:33:16,3,at-at,0.0,3.0,6,1,Corellia
3,2023-06-13 17:33:15,4,tie_silencer,6.0,1.0,6,9,Shili
4,2023-06-13 17:33:14,5,tie_fighter,0.0,4.0,9,6,Muunilinst
...,...,...,...,...,...,...,...,...
9999995,2023-02-18 10:26:20,9999996,resistance_soldier,6.0,6.0,1,3,Troiken
9999996,2023-02-18 10:26:19,9999997,tie_silencer,8.0,6.0,2,0,Kashyyyk
9999997,2023-02-18 10:26:18,9999998,tie_fighter,7.0,7.0,6,4,Kashyyyk
9999998,2023-02-18 10:26:17,9999999,tie_fighter,6.0,6.0,8,8,Kalee


In [11]:
# Recount the unit_type labels on the cleaned frame to confirm the relabelling.
unit_type = inplace.loc[:, 'unit_type'].value_counts()
unit_type

x-wing                1428412
stormtrooper          1428291
tie_silencer          1428016
tie_fighter           1426809
at-at                 1426525
at-st                 1426173
resistance_soldier    1425774
unknown                 10000
Name: unit_type, dtype: int64

### Some location_x and location_y values are missing. Use the ffill method to fill them

In [18]:

# Count missing (True) versus present (False) location_x values.
pd.isna(inplace['location_x']).value_counts()


False    9999500
True         500
Name: location_x, dtype: int64

In [20]:
 # Forward-fill gaps in location_x with the last preceding non-missing value.
 # The result is assigned back to the column: calling ffill(inplace=True) on a
 # column selection is a chained in-place update, which pandas warns about and
 # which does not modify the frame when Copy-on-Write is enabled.
 inplace['location_x'] = inplace['location_x'].ffill()

In [21]:

# Re-check location_x for missing values after the forward fill.
pd.isna(inplace['location_x']).value_counts()

False    10000000
Name: location_x, dtype: int64

In [23]:
# Count missing (True) versus present (False) location_y values.
pd.isna(inplace['location_y']).value_counts()

False    9999500
True         500
Name: location_y, dtype: int64

In [24]:
# Forward-fill gaps in location_y with the last preceding non-missing value,
# then recount missing values in that same column to confirm the fill.
# The result is assigned back rather than using ffill(inplace=True) on a
# column selection, which is a chained in-place update that pandas warns about.
inplace['location_y'] = inplace['location_y'].ffill()
inplace['location_y'].isna().value_counts()

False    10000000
Name: location_x, dtype: int64

### Save the clean data into a Parquet file named troop_movements10m.parquet

In [25]:
# Persist the cleaned frame as Parquet, using the file name requested by the
# section heading for this step (troop_movements10m.parquet).
inplace.to_parquet('troop_movements10m.parquet')