In [99]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt  # Correct import statement
from sklearn.model_selection import train_test_split
from sklearn import linear_model

In [104]:
np.random.seed(0)  # For reproducibility

data = {
    'name': ['Student_' + str(i) for i in range(1, 31)],
    'sex': np.random.choice(['F', 'M'], size=30),  # Categorical variable for sex
    'mindset': np.random.choice(['Fair', 'Good', 'Strong', 'Expert'], size=30),  # Categorical variable for mindset
    'math_score': np.random.randint(70, 101, size=30),
    'bio_score': np.random.randint(70, 101, size=30),
    'age': np.random.randint(16, 19, size=30),
    'student_job': np.random.choice(['Part-time', 'None'], size=30),
    'GP': np.round(np.random.uniform(2.5, 4.0, size=30), 2),
    'studytime': np.random.randint(1, 5, size=30)  # Random integer between 1 and 4 hours
}

# Create a DataFrame
df = pd.DataFrame(data)

# Save the DataFrame to a CSV file
df.to_csv('mydata.csv', index=True)

In [105]:
df.head()

Unnamed: 0,name,sex,mindset,math_score,bio_score,age,student_job,GP,studytime
0,Student_1,F,Good,73,81,17,Part-time,3.1,4
1,Student_2,M,Fair,100,84,17,,3.33,2
2,Student_3,M,Expert,81,88,17,Part-time,2.75,3
3,Student_4,F,Fair,88,97,16,,3.05,2
4,Student_5,M,Expert,93,70,17,,2.72,4


In [106]:
df=df[['age','bio_score', 'studytime','math_score', 'GP', 'sex', 'mindset']]
df

Unnamed: 0,age,bio_score,studytime,math_score,GP,sex,mindset
0,17,81,4,73,3.1,F,Good
1,17,84,2,100,3.33,M,Fair
2,17,88,3,81,2.75,M,Expert
3,16,97,2,88,3.05,F,Fair
4,17,70,4,93,2.72,M,Expert
5,17,84,2,98,3.35,M,Good
6,17,73,4,72,3.56,M,Strong
7,16,91,1,70,2.93,M,Expert
8,17,82,1,70,3.15,M,Expert
9,18,95,4,74,3.63,M,Fair


In [107]:
df.mindset.unique()

array(['Good', 'Fair', 'Expert', 'Strong'], dtype=object)

In [108]:
# one way to change categorial to 0 or 1 by code:
df['n_sex']=[0 if i=='F' else 1 for i in df['sex']]

In [109]:
df.head()

Unnamed: 0,age,bio_score,studytime,math_score,GP,sex,mindset,n_sex
0,17,81,4,73,3.1,F,Good,0
1,17,84,2,100,3.33,M,Fair,1
2,17,88,3,81,2.75,M,Expert,1
3,16,97,2,88,3.05,F,Fair,0
4,17,70,4,93,2.72,M,Expert,1


In [114]:
X = df.drop(columns=['GP','sex', 'mindset'])

# Define y as the 'GP' column
y = df['GP']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.1)

In [116]:
X_train.shape

(27, 5)

In [117]:
model=linear_model.LinearRegression()
model.fit(X_train, y_train)
acc=model.score(X_test, y_test)

In [118]:
print(acc)

0.6662208376592977


In [119]:
model.coef_

array([ 0.00051901, -0.00833703, -0.06163764, -0.0042061 , -0.24578515])

In [120]:
model.intercept_

4.617483835825149

In [121]:
for i in range(len(predicted)):
    print(f"Real : {y_test.iloc[i]}, Predicted: {predicted[i]}")
    print("**********************************************")

Real : 2.87, Predicted: 3.3360643393784635
**********************************************
Real : 3.52, Predicted: 3.5578576952528747
**********************************************
Real : 3.33, Predicted: 3.257526941486801
**********************************************


In [122]:
epoch=30
best = 0

# Loop to find best accuracy
for i in range(epoch):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
    model = linear_model.LinearRegression()
    model.fit(X_train, y_train)
    acc = model.score(X_test, y_test)
    if acc > best:  # Correct logic to find the best accuracy
        best = acc

print(f"Best accuracy: {best}")

Best accuracy: 0.7863267573435471


### A new other way to add categorial features

In [123]:
from sklearn import preprocessing

In [129]:
myPreprocessing=preprocessing.LabelEncoder()
sex=myPreprocessing.fit_transform(list(df['sex']))
mindset=mypreprocessing.fit_transform(list(df['mindset']))

In [130]:
mindset

array([2, 1, 0, 1, 0, 2, 3, 0, 0, 1, 3, 0, 1, 2, 0, 2, 0, 0, 3, 0, 1, 2,
       2, 2, 0, 1, 0, 3, 1, 0])

In [138]:
#Now add new sex and mindset to dataframe use zip 
X = list(zip(df['age'], df['bio_score'], df['studytime'], df['math_score'], mindset, sex))
y=df['GP']


In [139]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.1, random_state=1234)
model=linear_model.LinearRegression()
model.fit(X_train, y_train)
score=model.score(X_test,y_test)
score

-6.611806405150349