In [1]:
import pandas as pd
# Load the monthly airline-passenger series; the path is resolved relative to the
# kernel's working directory.
CSV_PATH = 'AirPassengers.csv'
data = pd.read_csv(CSV_PATH)

In [2]:
# Preview the first rows of the freshly loaded frame ('Month' and '#Passengers').
data.head()

Unnamed: 0,Month,#Passengers
0,1949-01,112
1,1949-02,118
2,1949-03,132
3,1949-04,129
4,1949-05,121


In [3]:
# Inspect column dtypes: right after loading, 'Month' is still a plain object (string) column.
data.dtypes

Month          object
#Passengers     int64
dtype: object

In [4]:
# Parse the 'YYYY-MM' strings into timestamps so the .dt accessors can be used later.
data = data.assign(Month=pd.to_datetime(data['Month'], format='%Y-%m'))

In [5]:
# Re-check dtypes: after the conversion above 'Month' should be datetime64[ns].
data.dtypes

Month          datetime64[ns]
#Passengers             int64
dtype: object

In [6]:
# Derive the regression features from the parsed 'Month' timestamps:
#   Year      - calendar year of the observation
#   Month_num - month of the year
#   Sequence  - 1-based row position, used as a running time index
month_stamps = data['Month'].dt
data = data.assign(
    Year=month_stamps.year,
    Month_num=month_stamps.month,
    Sequence=range(1, len(data) + 1),
)

data.head()

Unnamed: 0,Month,#Passengers,Year,Month_num,Sequence
0,1949-01-01,112,1949,1,1
1,1949-02-01,118,1949,2,2
2,1949-03-01,132,1949,3,3
3,1949-04-01,129,1949,4,4
4,1949-05-01,121,1949,5,5


In [7]:
# Inspect the last rows to see where the history ends (month and final Sequence value).
data.tail()

Unnamed: 0,Month,#Passengers,Year,Month_num,Sequence
139,1960-08-01,606,1960,8,140
140,1960-09-01,508,1960,9,141
141,1960-10-01,461,1960,10,142
142,1960-11-01,390,1960,11,143
143,1960-12-01,432,1960,12,144


In [8]:
from sklearn.linear_model import LinearRegression

# Design matrix and target for the baseline model.
# NOTE(review): if the rows are consecutive months starting 1949-01 (as head()/tail()
# suggest), Sequence == 12 * (Year - 1949) + Month_num, i.e. the three features are
# exactly collinear and the individual coefficients are not interpretable - confirm.
feature_columns = ['Year', 'Month_num', 'Sequence']
X = data[feature_columns]
y = data['#Passengers']

# Ordinary least squares, created and fitted in one step (fit() returns the estimator).
model = LinearRegression().fit(X, y)

# Display the fitted estimator as the cell output.
model

In [9]:
# First month to forecast: the one right after the last month in the history.
start_date = '1961-01'

# 36 month-start timestamps ('MS' = month start), i.e. three years ahead.
dates = pd.date_range(start=start_date, periods=36, freq='MS')

# Calendar features for each forecast month.
years, month_nums = dates.year, dates.month

# Continue the running index where the history stopped (144): 145 .. 180.
sequences = range(145, 145 + 36)

# Feature frame with the same column names and order the models are trained on.
prediction_data = pd.DataFrame(
    dict(Year=years, Month_num=month_nums, Sequence=sequences)
)

# Preview the first forecast rows.
prediction_data.head()

Unnamed: 0,Year,Month_num,Sequence
0,1961,1,145
1,1961,2,146
2,1961,3,147
3,1961,4,148
4,1961,5,149


In [10]:
# Forecast the future months described by prediction_data with the baseline linear model.
predictions = model.predict(prediction_data)
# Bare expression: show the raw forecast array as the cell output.
predictions

array([475.68089549, 477.88514957, 480.08940365, 482.29365773,
       484.49791181, 486.70216589, 488.90641997, 491.11067405,
       493.31492813, 495.51918221, 497.72343629, 499.92769037,
       507.60484654, 509.80910062, 512.0133547 , 514.21760878,
       516.42186286, 518.62611694, 520.83037102, 523.0346251 ,
       525.23887918, 527.44313326, 529.64738733, 531.85164141,
       539.52879759, 541.73305167, 543.93730575, 546.14155983,
       548.34581391, 550.55006799, 552.75432207, 554.95857615,
       557.16283023, 559.3670843 , 561.57133838, 563.77559246])

In [11]:
# One month-start timestamp per forecast value, beginning January 1961.
horizon = len(predictions)
prediction_dates = pd.date_range(start='1961-01-01', periods=horizon, freq='MS')

# 'YYYY-MM' labels, the same textual form the plotting cell uses for the history.
formatted_dates = prediction_dates.strftime('%Y-%m')

# Pair every label with its forecast value.
forecast_columns = {
    'Month': formatted_dates,
    'Forecast_Passengers': predictions,
}
predictions_df = pd.DataFrame(forecast_columns)

predictions_df.head()

Unnamed: 0,Month,Forecast_Passengers
0,1961-01,475.680895
1,1961-02,477.88515
2,1961-03,480.089404
3,1961-04,482.293658
4,1961-05,484.497912


In [13]:
import plotly.graph_objects as go

# Build a plotting copy whose 'Month' is a 'YYYY-MM' string so it lines up with
# predictions_df. The shared `data` frame is deliberately left untouched: overwriting
# its 'Month' column here would silently change that column's dtype for other cells.
history = data.assign(Month=pd.to_datetime(data['Month']).dt.strftime('%Y-%m'))

# Stack history and forecast on one 'Month' index; columns that exist on only one
# side ('#Passengers' vs 'Forecast_Passengers') are NaN on the other side.
combined_data = pd.concat([
    history.set_index('Month'),
    predictions_df.set_index('Month')
], axis=0)

# Number of historical rows; every row after that position is a forecast.
n_history = len(history)

fig = go.Figure()

# Observed passenger counts (positional slice, hence .iloc).
fig.add_trace(go.Scatter(
    x=combined_data.index[:n_history],
    y=combined_data['#Passengers'].iloc[:n_history],
    mode='lines+markers', name='Actual Passengers'))

# Model forecast.
fig.add_trace(go.Scatter(
    x=combined_data.index[n_history:],
    y=combined_data['Forecast_Passengers'].iloc[n_history:],
    mode='lines+markers', name='Forecasted Passengers'))

# One tick every 12 rows. Plotly pairs tickvals[i] with ticktext[i], so both lists
# must be the same length; a per-row ticktext list would leave most ticks blank
# and attach wrong labels to others.
yearly_ticks = list(combined_data.index[::12])

fig.update_layout(
    title='Airline Passenger Numbers: Historical and Forecasted',
    xaxis_title='Date',
    yaxis_title='Number of Passengers',
    legend_title='Legend',
    xaxis=dict(
        tickmode='array',
        tickvals=yearly_ticks,
        ticktext=yearly_ticks,
    )
)

# Show the plot
fig.show()

# Treating Month_num as Categorical

In [16]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# X and y are reused from the baseline linear-regression cell
# (features Year, Month_num, Sequence; target #Passengers).

# Month_num is treated as a category: the encoder emits one indicator column per
# distinct month. Columns not named here are forwarded unchanged
# through remainder='passthrough'.
month_encoder = ('month_num', OneHotEncoder(), ['Month_num'])
preprocessor = ColumnTransformer(
    transformers=[month_encoder],
    remainder='passthrough',
)

# Encode first, then run ordinary least squares on the encoded matrix.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
]
pipeline = Pipeline(steps=pipeline_steps)

# fit() is the last expression, so the fitted pipeline is displayed as cell output.
pipeline.fit(X, y)

In [18]:
# Forecast with whichever `pipeline` was fitted most recently (in notebook order: the
# one-hot month model). This overwrites the `predictions` array of the previous model.
predictions = pipeline.predict(prediction_data)
# Bare expression: show the raw forecast array as the cell output.
predictions

array([449.25564447, 442.50564396, 477.67231011, 474.58897627,
       479.33897575, 519.17230858, 558.83897473, 558.58897422,
       509.92230704, 474.0889732 , 440.33897269, 469.33897217,
       481.17958934, 474.42958883, 509.59625498, 506.51292114,
       511.26292062, 551.09625345, 590.7629196 , 590.51291909,
       541.84625191, 506.01291807, 472.26291756, 501.26291704,
       513.10353421, 506.3535337 , 541.52019985, 538.43686601,
       543.18686549, 583.02019832, 622.68686447, 622.43686396,
       573.77019678, 537.93686294, 504.18686243, 533.18686191])

In [19]:
# One month-start timestamp per forecast value, beginning January 1961.
horizon = len(predictions)
prediction_dates = pd.date_range(start='1961-01-01', periods=horizon, freq='MS')

# 'YYYY-MM' labels, the same textual form the plotting cell uses for the history.
formatted_dates = prediction_dates.strftime('%Y-%m')

# Pair every label with its forecast value.
forecast_columns = {
    'Month': formatted_dates,
    'Forecast_Passengers': predictions,
}
predictions_df = pd.DataFrame(forecast_columns)

predictions_df.head()

Unnamed: 0,Month,Forecast_Passengers
0,1961-01,449.255644
1,1961-02,442.505644
2,1961-03,477.67231
3,1961-04,474.588976
4,1961-05,479.338976


In [20]:
import plotly.graph_objects as go

# Build a plotting copy whose 'Month' is a 'YYYY-MM' string so it lines up with
# predictions_df. The shared `data` frame is deliberately left untouched: overwriting
# its 'Month' column here would silently change that column's dtype for other cells.
history = data.assign(Month=pd.to_datetime(data['Month']).dt.strftime('%Y-%m'))

# Stack history and forecast on one 'Month' index; columns that exist on only one
# side ('#Passengers' vs 'Forecast_Passengers') are NaN on the other side.
combined_data = pd.concat([
    history.set_index('Month'),
    predictions_df.set_index('Month')
], axis=0)

# Number of historical rows; every row after that position is a forecast.
n_history = len(history)

fig = go.Figure()

# Observed passenger counts (positional slice, hence .iloc).
fig.add_trace(go.Scatter(
    x=combined_data.index[:n_history],
    y=combined_data['#Passengers'].iloc[:n_history],
    mode='lines+markers', name='Actual Passengers'))

# Model forecast.
fig.add_trace(go.Scatter(
    x=combined_data.index[n_history:],
    y=combined_data['Forecast_Passengers'].iloc[n_history:],
    mode='lines+markers', name='Forecasted Passengers'))

# One tick every 12 rows. Plotly pairs tickvals[i] with ticktext[i], so both lists
# must be the same length; a per-row ticktext list would leave most ticks blank
# and attach wrong labels to others.
yearly_ticks = list(combined_data.index[::12])

fig.update_layout(
    title='Airline Passenger Numbers: Historical and Forecasted',
    xaxis_title='Date',
    yaxis_title='Number of Passengers',
    legend_title='Legend',
    xaxis=dict(
        tickmode='array',
        tickvals=yearly_ticks,
        ticktext=yearly_ticks,
    )
)

# Show the plot
fig.show()

# Polynomial Regression

In [56]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# X and y are reused from the baseline linear-regression cell
# (features Year, Month_num, Sequence; target #Passengers).

# Degree-2 polynomial expansion of the numeric columns.
# NOTE(review): PolynomialFeatures is left at its default include_bias, so it is
# expected to add a constant column on top of LinearRegression's own intercept -
# confirm whether include_bias=False is wanted.
numerical_features = ['Year', 'Sequence']
poly_step = ('poly', PolynomialFeatures(degree=2))
numerical_transformer = Pipeline(steps=[poly_step])

# Month_num -> one indicator column per month; Year/Sequence -> polynomial terms.
# NOTE(review): with the three-column X every column is already claimed by a
# transformer, so remainder='passthrough' presumably has nothing left to forward.
month_encoder = ('month_num', OneHotEncoder(), ['Month_num'])
numeric_expander = ('num', numerical_transformer, numerical_features)
preprocessor = ColumnTransformer(
    transformers=[month_encoder, numeric_expander],
    remainder='passthrough',
)

# Preprocess, then fit ordinary least squares on the expanded feature matrix.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
]
pipeline = Pipeline(steps=pipeline_steps)

# After this call the pipeline can predict from raw Year/Month_num/Sequence columns;
# fit() is the last expression, so the fitted pipeline is displayed as cell output.
pipeline.fit(X, y)


In [57]:
# Forecast with whichever `pipeline` was fitted most recently (in notebook order: the
# polynomial + one-hot model). This overwrites the previous `predictions` array.
predictions = pipeline.predict(prediction_data)
# Bare expression: show the raw forecast array as the cell output.
predictions

array([466.06910451, 461.70093858, 499.24943932, 498.54794007,
       505.67977414, 547.89494155, 589.94344229, 592.07527637,
       545.79044377, 512.33894452, 480.97077859, 512.35261267,
       509.78389292, 505.78216301, 543.69709976, 543.36203652,
       550.8603066 , 593.44191002, 635.85684678, 638.35511686,
       592.43672028, 559.35165704, 528.34992712, 560.09819721,
       555.47100069, 551.83570679, 590.11707955, 590.14845232,
       598.01315841, 640.96119784, 683.74257061, 686.60727671,
       641.05531614, 608.3366889 , 577.701395  , 609.8161011 ])

In [58]:
# One month-start timestamp per forecast value, beginning January 1961.
horizon = len(predictions)
prediction_dates = pd.date_range(start='1961-01-01', periods=horizon, freq='MS')

# 'YYYY-MM' labels, the same textual form the plotting cell uses for the history.
formatted_dates = prediction_dates.strftime('%Y-%m')

# Pair every label with its forecast value.
forecast_columns = {
    'Month': formatted_dates,
    'Forecast_Passengers': predictions,
}
predictions_df = pd.DataFrame(forecast_columns)

predictions_df.head()

Unnamed: 0,Month,Forecast_Passengers
0,1961-01,466.069105
1,1961-02,461.700939
2,1961-03,499.249439
3,1961-04,498.54794
4,1961-05,505.679774


In [59]:
import plotly.graph_objects as go

# Build a plotting copy whose 'Month' is a 'YYYY-MM' string so it lines up with
# predictions_df. The shared `data` frame is deliberately left untouched: overwriting
# its 'Month' column here would silently change that column's dtype for other cells.
history = data.assign(Month=pd.to_datetime(data['Month']).dt.strftime('%Y-%m'))

# Stack history and forecast on one 'Month' index; columns that exist on only one
# side ('#Passengers' vs 'Forecast_Passengers') are NaN on the other side.
combined_data = pd.concat([
    history.set_index('Month'),
    predictions_df.set_index('Month')
], axis=0)

# Number of historical rows; every row after that position is a forecast.
n_history = len(history)

fig = go.Figure()

# Observed passenger counts (positional slice, hence .iloc).
fig.add_trace(go.Scatter(
    x=combined_data.index[:n_history],
    y=combined_data['#Passengers'].iloc[:n_history],
    mode='lines+markers', name='Actual Passengers'))

# Model forecast.
fig.add_trace(go.Scatter(
    x=combined_data.index[n_history:],
    y=combined_data['Forecast_Passengers'].iloc[n_history:],
    mode='lines+markers', name='Forecasted Passengers'))

# One tick every 12 rows. Plotly pairs tickvals[i] with ticktext[i], so both lists
# must be the same length; a per-row ticktext list would leave most ticks blank
# and attach wrong labels to others.
yearly_ticks = list(combined_data.index[::12])

fig.update_layout(
    title='Airline Passenger Numbers: Historical and Forecasted',
    xaxis_title='Date',
    yaxis_title='Number of Passengers',
    legend_title='Legend',
    xaxis=dict(
        tickmode='array',
        tickvals=yearly_ticks,
        ticktext=yearly_ticks,
    )
)

# Show the plot
fig.show()