In [1]:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

# 1. Dataset Exploration
# Load the bundled Iris dataset and wrap its feature matrix in a DataFrame
# whose columns are labelled with the dataset's own feature names.
iris = load_iris()
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)

# Each report is a heading followed by the value it describes. Every heading
# after the first begins with "\n" so the sections are separated by a blank
# line in the printed output.
exploration_reports = (
    ("First 5 rows:", iris_df.head()),
    ("\nDataset shape:", iris_df.shape),
    ("\nSummary statistics:", iris_df.describe()),
)
for heading, report in exploration_reports:
    print(heading)
    print(report)

# 2. Data Splitting (Using the first two features for simplicity)
# The split is bound to iris-specific names on purpose: the regression section
# further down assigns X_train / X_test / y_train / y_test for the salary data,
# and sharing those names would silently discard this split.
iris_X_train, iris_X_test, iris_y_train, iris_y_test = train_test_split(
    iris.data[:, :2],  # keep only feature columns 0 and 1
    iris.target,
    test_size=0.2,     # hold out 20% of the samples for testing
    random_state=42,   # fixed seed so the split is reproducible
)

print("\nNumber of samples in training set:", len(iris_X_train))
print("Number of samples in testing set:", len(iris_X_test))

# 3. Linear Regression on a Mock Dataset
# Build a synthetic 'YearsExperience' -> 'Salary' dataset. The global NumPy
# seed is fixed first so the draws below are reproducible; the uniform draw
# must stay ahead of the normal draw to keep the generated values unchanged.
np.random.seed(42)
years_experience = 10 * np.random.rand(150)  # uniform values in [0, 10)
noise = np.random.randn(150) * 10000  # Gaussian noise, standard deviation 10000
salary = 50000 + years_experience * 1500 + noise  # linear trend plus noise

# Hold the synthetic columns in a DataFrame, standing in for a CSV file.
data = pd.DataFrame({'YearsExperience': years_experience, 'Salary': salary})

# scikit-learn expects a 2-D feature matrix, so the single feature column is
# given an explicit trailing axis; the target stays 1-D.
X = data['YearsExperience'].to_numpy()[:, np.newaxis]
y = data['Salary'].to_numpy()

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit ordinary least squares on the training portion (fit returns the estimator).
model = LinearRegression().fit(X_train, y_train)

# Score the held-out portion with mean squared error.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)

print("\nMean Squared Error:", mse)

First 5 rows:
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                5.1               3.5                1.4               0.2
1                4.9               3.0                1.4               0.2
2                4.7               3.2                1.3               0.2
3                4.6               3.1                1.5               0.2
4                5.0               3.6                1.4               0.2

Dataset shape:
(150, 4)

Summary statistics:
       sepal length (cm)  sepal width (cm)  petal length (cm)  \
count         150.000000        150.000000         150.000000   
mean            5.843333          3.057333           3.758000   
std             0.828066          0.435866           1.765298   
min             4.300000          2.000000           1.000000   
25%             5.100000          2.800000           1.600000   
50%             5.800000          3.000000           4.350000   
75%             6.400000     