In [93]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [94]:
from tqdm.notebook import tqdm, trange
import time  

In [95]:
df = pd.read_csv('mil_data_cleaned.csv')
df.dtypes

  exec(code_obj, self.user_global_ns, self.user_ns)


training_group          object
country                 object
deliv_unit              object
recip_unit              object
start_date              object
end_date                object
cost                   float64
start_year             float64
quant_one                int64
log_training_length    float64
has_deliv                int64
DOS                      int64
DOD                      int64
IMET                     int64
FMF                      int64
CTFP                     int64
Regional_Center          int64
language_course          int64
MET                      int64
MTT                      int64
seminar                  int64
terrorism                int64
human_rights             int64
conference               int64
relations                int64
policy                   int64
in_US                    int64
dtype: object

In [96]:
#get dummies for training_group
dummies = pd.get_dummies(df['training_group'])

df = df.drop('training_group',axis = 1)
# Join the encoded df
df = df.join(dummies)
df.head()

Unnamed: 0,country,deliv_unit,recip_unit,start_date,end_date,cost,start_year,quant_one,log_training_length,has_deliv,...,Africa,All_Regions,East_Asia_And_Pacific,East_Asia_and_Pacific,Europe,Near_East,Newly_Independent_States,South_Asia,South_Central_Asia,Western_Hemisphere
0,Angola,,MINISTRY OF FOREIGN AFFAIRS MOD,2001-01-29,2001-02-09,9667.0,2001.0,0,2.484907,0,...,1,0,0,0,0,0,0,0,0,0
1,Benin,,"ARMY, GROUND FORCES, 1ST BATALLION, COMBINED-ARMS",2001-05-07,2001-05-12,1296.0,2001.0,1,1.791759,0,...,1,0,0,0,0,0,0,0,0,0
2,Benin,,"AIR FORCE, ON TRAINING IN THE US",2001-01-22,2001-06-08,18241.0,2001.0,1,4.927254,0,...,1,0,0,0,0,0,0,0,0,0
3,Benin,,ON TRAINING IN THE US,2001-04-09,2001-08-24,13944.0,2001.0,1,4.927254,0,...,1,0,0,0,0,0,0,0,0,0
4,Benin,,"NAVY, POSTED AT GENERAL CHIEF OF STAFF HEADQUA...",2001-05-07,2001-09-21,14044.0,2001.0,1,4.927254,0,...,1,0,0,0,0,0,0,0,0,0


In [97]:
#get dummies for country
dummies = pd.get_dummies(df['country'])

df = df.drop('country',axis = 1)
# Join the encoded df
df = df.join(dummies)

In [98]:
delivery_unique = pd.DataFrame(df['deliv_unit'].unique())
delivery_unique.fillna('',inplace=True)

In [99]:
#get dummies for delivery unit
dummies = pd.get_dummies(df['deliv_unit'],prefix='deliv_u')
df= df.drop('deliv_unit',axis=1)
df = df.join(dummies)


In [100]:
df.shape

(196487, 1842)

In [101]:
df = df.drop(labels=['recip_unit','start_date','end_date'],axis=1)

In [102]:
#Split the data  
X_train, X_test, y_train, y_test = train_test_split(df.drop(labels='cost',axis=1),df['cost'],test_size = 0.3)

In [103]:
#Scale our data
scaler = StandardScaler()
scaler.fit(X_train.values)
X_train_scaled = scaler.transform(X_train.values)
X_test_scaled = scaler.transform(X_test.values)
names = X_train.columns
X_train_scaled = pd.DataFrame(X_train_scaled,columns=names)
X_train_scaled.head()


Unnamed: 0,start_year,quant_one,log_training_length,has_deliv,DOS,DOD,IMET,FMF,CTFP,Regional_Center,...,deliv_u_WJPC,deliv_u_WPC/IEAFA,"deliv_u_WRIGHT PATTERSON AFB , OH - DISAM","deliv_u_WRIGHT PATTERSON AFB , OH - DISAM - N/A","deliv_u_WRIGHT PATTERSON AFB , OH - DISCS",deliv_u_XVIII AIRBORNE CORPS (& FORT BRAGG ?),deliv_u_XVIII AIRBORNE CORPS (& FORT BRAGG ?) - N/A,deliv_u_YEMEN (IN COUNTRY TRAINING),deliv_u_YEMEN (SANA) (IN COUNTRY TRAINING),deliv_u_ZAMBIA (IN COUNTRY TRAINING)
0,1.318417,-2.753015,-0.350033,1.171321,0.965113,-0.713761,-0.820609,-0.424017,-0.322619,-0.414174,...,-0.077349,-0.009341,-0.04402,-0.015015,-0.017268,-0.009722,-0.017268,0.0,-0.013483,-0.012648
1,-0.240844,0.363238,-0.580205,-0.853737,0.965113,-0.713761,1.218607,-0.424017,-0.322619,-0.414174,...,-0.077349,-0.009341,-0.04402,-0.015015,-0.017268,-0.009722,-0.017268,0.0,-0.013483,-0.012648
2,-0.630659,-2.753015,-2.33819,-0.853737,-1.036148,1.401029,-0.820609,-0.424017,-0.322619,2.414443,...,-0.077349,-0.009341,-0.04402,-0.015015,-0.017268,-0.009722,-0.017268,0.0,-0.013483,-0.012648
3,-0.045937,-2.753015,0.017636,-0.853737,0.965113,-0.713761,-0.820609,-0.424017,-0.322619,-0.414174,...,-0.077349,-0.009341,-0.04402,-0.015015,-0.017268,-0.009722,-0.017268,0.0,-0.013483,-0.012648
4,-1.020475,0.363238,1.253138,-0.853737,0.965113,-0.713761,1.218607,-0.424017,-0.322619,-0.414174,...,-0.077349,-0.009341,-0.04402,-0.015015,-0.017268,-0.009722,-0.017268,0.0,-0.013483,-0.012648


In [104]:
X_test_scaled = pd.DataFrame(X_test_scaled,columns=names)
X_test_scaled.head()

Unnamed: 0,start_year,quant_one,log_training_length,has_deliv,DOS,DOD,IMET,FMF,CTFP,Regional_Center,...,deliv_u_WJPC,deliv_u_WPC/IEAFA,"deliv_u_WRIGHT PATTERSON AFB , OH - DISAM","deliv_u_WRIGHT PATTERSON AFB , OH - DISAM - N/A","deliv_u_WRIGHT PATTERSON AFB , OH - DISCS",deliv_u_XVIII AIRBORNE CORPS (& FORT BRAGG ?),deliv_u_XVIII AIRBORNE CORPS (& FORT BRAGG ?) - N/A,deliv_u_YEMEN (IN COUNTRY TRAINING),deliv_u_YEMEN (SANA) (IN COUNTRY TRAINING),deliv_u_ZAMBIA (IN COUNTRY TRAINING)
0,0.733694,0.363238,1.262077,1.171321,0.965113,-0.713761,-0.820609,-0.424017,-0.322619,-0.414174,...,-0.077349,-0.009341,-0.04402,-0.015015,-0.017268,-0.009722,-0.017268,0.0,-0.013483,-0.012648
1,1.513324,0.363238,0.976704,1.171321,0.965113,-0.713761,-0.820609,-0.424017,-0.322619,-0.414174,...,-0.077349,-0.009341,-0.04402,-0.015015,-0.017268,-0.009722,-0.017268,0.0,-0.013483,-0.012648
2,-1.020475,0.363238,-0.350033,-0.853737,-1.036148,-0.713761,-0.820609,2.358394,-0.322619,-0.414174,...,-0.077349,-0.009341,-0.04402,-0.015015,-0.017268,-0.009722,-0.017268,0.0,-0.013483,-0.012648
3,-1.215382,0.363238,0.707495,-0.853737,-1.036148,-0.713761,1.218607,-0.424017,-0.322619,-0.414174,...,-0.077349,-0.009341,-0.04402,-0.015015,-0.017268,-0.009722,-0.017268,0.0,-0.013483,-0.012648
4,-0.435752,0.363238,-0.226698,-0.853737,0.965113,-0.713761,-0.820609,2.358394,-0.322619,-0.414174,...,-0.077349,-0.009341,-0.04402,-0.015015,-0.017268,-0.009722,-0.017268,0.0,-0.013483,-0.012648


In [105]:
X_train.to_csv('preprocessed_data/X_train.csv',index=False)
X_test.to_csv('preprocessed_data/X_test.csv',index=False)
y_train.to_csv('preprocessed_data/y_train.csv',index=False)
y_test.to_csv('preprocessed_data/y_test.csv',index=False)

Save our data to our preprocessed data folder.