In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import cross_val_score

# load the data
border_df = pd.read_csv('Border_Crossing_Entry_Data.csv')

# convert date to datetime and extract year 
border_df['Date'] = pd.to_datetime(border_df['Date'].str.strip(), errors='coerce')
border_df['Year'] = border_df['Date'].dt.year

# drop missing values
border_df = df.dropna()

# label encode columns
label_columns = ['Port Name', 'State', 'Measure', 'Border']
label_encoder = LabelEncoder()
for column in label_columns:
    border_df[column] = label_encoder.fit_transform(border_df[column])

# predict future increase/decrease
border_df['Prev_Year_Value'] = border_df.groupby(['Port Code', 'Measure'])['Value'].shift(1)

# drop rows with na values
border_df = border_df.dropna(subset=['Prev_Year_Value'])

# define the target variable # 1 if value increased, 0 if decreased
border_df['Value_Change'] = border_df['Value'] > border_df['Prev_Year_Value'] 
border_df['Value_Change'] = border_df['Value_Change'].astype(int)

# define features (X) and target (y)
features = ['Port Code', 'State', 'Border', 'Measure', 'Year', 'Prev_Year_Value', 'Latitude', 'Longitude']
X = border_df[features]
y = border_df['Value_Change']

# split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# build the Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# make predictions
y_pred = model.predict(X_test)

# evaluate the model
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(f'Classification Report:\n{classification_report(y_test, y_pred)}')

Accuracy: 0.7776420281001832
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.83      0.83     50907
           1       0.69      0.67      0.68     27669

    accuracy                           0.78     78576
   macro avg       0.76      0.75      0.76     78576
weighted avg       0.78      0.78      0.78     78576



In [9]:
# split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# build the Random Forest model
model = RandomForestClassifier(n_estimators=200, random_state=42)
model.fit(X_train, y_train)

# make predictions
y_pred = model.predict(X_test)

# evaluate the model
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(f'Classification Report:\n{classification_report(y_test, y_pred)}')

Accuracy: 0.777523247132288
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.83      0.83     76253
           1       0.69      0.68      0.68     41611

    accuracy                           0.78    117864
   macro avg       0.76      0.75      0.76    117864
weighted avg       0.78      0.78      0.78    117864



In [11]:
# split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

# build the Random Forest model
model = RandomForestClassifier(n_estimators=300, random_state=42)
model.fit(X_train, y_train)

# make predictions
y_pred = model.predict(X_test)

# evaluate the model
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(f'Classification Report:\n{classification_report(y_test, y_pred)}')

Accuracy: 0.7717674608022806
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.83      0.82    127095
           1       0.68      0.67      0.67     69345

    accuracy                           0.77    196440
   macro avg       0.75      0.75      0.75    196440
weighted avg       0.77      0.77      0.77    196440

