In [None]:
pip install pandas scikit-learn matplotlib


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# ---------------------------------------------------------------------------
# Configuration: everything a reader might want to tune lives here.
# ---------------------------------------------------------------------------
url = 'https://raw.githubusercontent.com/dsrscientist/Data-Science-ML-Capstone-Projects/master/baseball.csv'
TARGET_COLUMN = 'W'   # column predicted by the model (plotted below as "Wins")
TEST_SIZE = 0.2       # fraction of rows held out for evaluation (80/20 split)
RANDOM_STATE = 42     # fixed seed so the train/test split is reproducible

# ---------------------------------------------------------------------------
# Load and inspect the data.
# ---------------------------------------------------------------------------
df = pd.read_csv(url)

# First few rows, to eyeball the columns and value ranges.
print(df.head())

# Per-column count of missing values. Nothing below imputes or drops rows, so
# this printout is the only missing-data check performed.
print(df.isnull().sum())

# ---------------------------------------------------------------------------
# Build the feature matrix (X) and target vector (y).
# ---------------------------------------------------------------------------
# `columns=` already identifies the axis, so no separate `axis=` argument is
# passed. Every column except the target is used as a feature.
# NOTE(review): assumes all remaining columns are numeric -- confirm against
# the head() output above.
X = df.drop(columns=[TARGET_COLUMN])
y = df[TARGET_COLUMN]

# Hold out TEST_SIZE of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# ---------------------------------------------------------------------------
# Fit an ordinary linear regression and predict on the held-out rows.
# ---------------------------------------------------------------------------
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# ---------------------------------------------------------------------------
# Evaluate on the held-out rows.
# ---------------------------------------------------------------------------
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)

# ---------------------------------------------------------------------------
# Visualize predicted vs. actual wins.
# ---------------------------------------------------------------------------
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred)

# Dashed y = x diagonal: a point on this line is a perfect prediction, so the
# vertical distance from the line is that sample's prediction error.
diag_lo = min(y_test.min(), y_pred.min())
diag_hi = max(y_test.max(), y_pred.max())
ax.plot(
    [diag_lo, diag_hi],
    [diag_lo, diag_hi],
    linestyle="--",
    color="gray",
    label="Perfect prediction",
)

ax.set_xlabel("Actual Wins")
ax.set_ylabel("Predicted Wins")
ax.set_title("Actual Wins vs. Predicted Wins")
ax.legend()
plt.show()
