In [1]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the raw dataset from 'Metro.csv' (path is relative to the working directory).
data = pd.read_csv(filepath_or_buffer='Metro.csv')

In [3]:
# Work on an independent copy so the in-place cleaning steps that follow
# never alter the raw `data` frame (re-wrapping with pd.DataFrame(...) does
# not guarantee a separate copy of the underlying values).
df = data.copy()

In [4]:
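# Date parsing is currently disabled: further down, 'Date' is passed to the
# model as a categorical string feature (one-hot encoded), not as a datetime.
#df['Date'] = pd.to_datetime(df['Date'], format='mixed')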

In [5]:
# Coerce the three columns to numeric dtype; with errors='coerce' any value
# that cannot be parsed becomes NaN instead of raising.
for count_col in ('Entry', 'Exit', 'Entry and Exit'):
    df[count_col] = pd.to_numeric(df[count_col], errors='coerce')

In [6]:
# Discard, in place, every row that is missing a value in any of the three
# numeric columns (how='any' is the default, so one missing value is enough).
df.dropna(subset=['Entry', 'Exit', 'Entry and Exit'], inplace=True)

In [7]:
# Cast the three numeric columns to integers in one pass; all other columns
# keep their current dtype.
df = df.astype({'Entry': int, 'Exit': int, 'Entry and Exit': int})

In [8]:
# Split 'Period' (presumably formatted like '20:30 - 20:44' -- confirm against
# the data) into its start and end parts.
# The vectorised .str accessor splits each value only once and propagates
# missing values, whereas calling x.split(...) per row raises AttributeError
# as soon as a 'Period' cell is NaN.
period_parts = df['Period'].str.split(' - ')
df['Period_Start'] = period_parts.str[0]
# .str[1] yields NaN when a value has no ' - ' separator (no end time).
df['Period_End'] = period_parts.str[1]
df.drop('Period', axis=1, inplace=True)

In [9]:
# First remove every row containing a missing value, then every column
# containing one. After the row-wise pass no missing values remain, so the
# column-wise pass cannot remove anything; it is kept for parity.
df = df.dropna(axis=0, how='any').dropna(axis=1, how='any')

In [10]:
def time_to_minutes(time_str):
    """Convert a clock-time string to minutes elapsed since midnight.

    Accepts a single time ('20:30'), a time with seconds ('20:30:00', the
    seconds are ignored) or a range ('20:30 - 20:44' or '20:30-20:44'), in
    which case only the start time is used.

    Parameters
    ----------
    time_str : str or None
        The time (or time range) to convert. None, NaN, pd.NA and blank
        strings are treated as missing.

    Returns
    -------
    int or None
        ``hours * 60 + minutes``, or None when the input is missing.

    Raises
    ------
    ValueError
        If the start time is not of the form 'H:M' or 'H:M:S', or if the
        hour/minute components are not integers.
    """
    # pd.isna also catches float NaN and pd.NA, which is how pandas represents
    # missing cells; a bare `is None` test would let them reach .split().
    if time_str is None or pd.isna(time_str):
        return None
    if not time_str.strip():
        return None

    # Everything before the first '-' is the start time; this covers both the
    # ' - ' and the bare '-' range separators.
    start_time_str = time_str.split('-')[0].strip()
    parts = start_time_str.split(':')
    if len(parts) not in (2, 3):
        raise ValueError(f"expected 'HH:MM' or 'HH:MM:SS', got {time_str!r}")

    hours, minutes = int(parts[0]), int(parts[1])
    return hours * 60 + minutes

In [11]:
# Convert both period bounds from time strings to minutes since midnight.
# No column-existence guard is used for 'Period_End': a fallback that assigns
# None would silently create an all-None column, whereas a missing column
# should surface immediately as a KeyError.
df['Period_Start'] = df['Period_Start'].apply(time_to_minutes)
df['Period_End'] = df['Period_End'].apply(time_to_minutes)

In [12]:
# Target: the 'Entry and Exit' column. Predictors: every other column.
# NOTE(review): 'Entry' and 'Exit' stay in X; if 'Entry and Exit' is simply
# their sum, the target is directly recoverable from the features -- confirm
# that this is intended.
y = df.loc[:, 'Entry and Exit']
X = df.drop('Entry and Exit', axis=1)


In [13]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [14]:
# Preprocessing
# 'Date' and 'Station' are treated as categories; the remaining predictors
# are numeric. (StandardScaler is imported in the notebook's import cell.)
categorical_features = ['Date', 'Station']
numerical_features = ['Entry', 'Exit', 'Period_Start', 'Period_End']

# Standardise the numeric columns and one-hot encode the categorical ones.
# drop='first' removes one level per categorical feature to avoid the dummy
# variable trap.
# NOTE(review): OneHotEncoder keeps its default handle_unknown='error', so a
# 'Date' or 'Station' value not seen during fit makes transform/predict
# raise -- confirm this is the desired behaviour for new inputs.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(drop='first'), categorical_features),
    ]
)


In [15]:


# Chain the preprocessing step and a linear regressor into one estimator so
# the same transformations are applied at both fit and predict time.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])

In [16]:
# Fit the pipeline on the training split (fit returns the pipeline itself),
# then predict the held-out split.
y_pred = model.fit(X_train, y_train).predict(X_test)

# Mean squared error between the held-out targets and the predictions.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 1447.945777241509


In [21]:
# One hand-built observation, given as a single record whose keys match the
# feature columns the model was trained on.
new_data = pd.DataFrame([
    {
        'Date': '5/10/2023',
        'Station': 'Kazipara',
        'Entry': 711,
        'Exit': 201,
        'Period_Start': time_to_minutes('20:30'),
        'Period_End': time_to_minutes('20:44'),
    }
])

In [22]:
# Score the single-row frame; predict returns one value per input row, so the
# first element is the prediction for that row.
predicted_entry_exit = model.predict(X=new_data)
print(f'Predicted Entry and Exit: {predicted_entry_exit[0]}')

Predicted Entry and Exit: 911.9858592336305
