# Analyze the impact of socio-economic factors like IncomeM and Schoolyears on the number of miscarriages, which could be indicative of reproductive health issues.

# regression model

 Load and Initial Data Check

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Load the dataset
data = pd.read_csv('FedCycleData.csv')

# Convert to numeric and handle missing values
data['IncomeM'] = pd.to_numeric(data['IncomeM'], errors='coerce')
data['Schoolyears'] = pd.to_numeric(data['Schoolyears'], errors='coerce')
data['Miscarriages'] = pd.to_numeric(data['Miscarriages'], errors='coerce')

# Filling missing values with the median
data.fillna({
    'IncomeM': data['IncomeM'].median(),
    'Schoolyears': data['Schoolyears'].median(),
    'Miscarriages': data['Miscarriages'].median()
}, inplace=True)

# Selecting features and target
X = data[['IncomeM', 'Schoolyears']]
y = data['Miscarriages']


Split Data and Build the Model

In [10]:
# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize and train the Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")


Mean Squared Error: 0.027368024773740784
R^2 Score: 0.1787293009920542


In [11]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Initialize and train the Decision Tree model
tree_model = DecisionTreeRegressor(max_depth=5)  # Limiting depth to prevent overfitting
tree_model.fit(X_train, y_train)

# Predict on the test set
y_pred_tree = tree_model.predict(X_test)

# Evaluate the model
mse_tree = mean_squared_error(y_test, y_pred_tree)
r2_tree = r2_score(y_test, y_pred_tree)
print(f"Decision Tree - Mean Squared Error: {mse_tree}")
print(f"Decision Tree - R^2 Score: {r2_tree}")


Decision Tree - Mean Squared Error: 0.0391451815298264
Decision Tree - R^2 Score: -0.17468435751489642


In [12]:
from sklearn.ensemble import RandomForestRegressor

# Initialize and train the Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42)
rf_model.fit(X_train, y_train)

# Predict on the test set
y_pred_rf = rf_model.predict(X_test)

# Evaluate the model
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)
print(f"Random Forest - Mean Squared Error: {mse_rf}")
print(f"Random Forest - R^2 Score: {r2_rf}")


Random Forest - Mean Squared Error: 0.035727062281613146
Random Forest - R^2 Score: -0.07211205982514546


Although better than Decision Trees and Gradient Boosting, the negative R² score indicates that the model does not effectively capture the underlying pattern and is worse than a simple mean model.

In [13]:
from sklearn.ensemble import GradientBoostingRegressor

# Initialize and train the Gradient Boosting model
gbm_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
gbm_model.fit(X_train, y_train)

# Predict on the test set
y_pred_gbm = gbm_model.predict(X_test)

# Evaluate the model
mse_gbm = mean_squared_error(y_test, y_pred_gbm)
r2_gbm = r2_score(y_test, y_pred_gbm)
print(f"Gradient Boosting - Mean Squared Error: {mse_gbm}")
print(f"Gradient Boosting - R^2 Score: {r2_gbm}")


Gradient Boosting - Mean Squared Error: 0.037218761008842784
Gradient Boosting - R^2 Score: -0.11687555542080141


Similar to Random Forest, this model also fails to provide a positive explanation of the variance, suggesting that the sequential corrections of residuals in Gradient Boosting did not align well with the patterns in the data for this specific case.

In [15]:
from sklearn.preprocessing import PolynomialFeatures

# Create polynomial features
poly = PolynomialFeatures(degree=2)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Fit the linear model on polynomial features
poly_model = LinearRegression()
poly_model.fit(X_train_poly, y_train)

# Predict and evaluate
y_pred_poly = poly_model.predict(X_test_poly)
mse_poly = mean_squared_error(y_test, y_pred_poly)
r2_poly = r2_score(y_test, y_pred_poly)
print(f"Polynomial Regression - Mean Squared Error: {mse_poly}")
print(f"Polynomial Regression - R^2 Score: {r2_poly}")


Polynomial Regression - Mean Squared Error: 0.02617117772529583
Polynomial Regression - R^2 Score: 0.2146447687763824


This model performed the best, suggesting that non-linear relationships between the predictors (Income and Schoolyears) and the number of miscarriages may exist. This indicates that incorporating non-linear terms improves the model's ability to explain the variability in the number of miscarriages.