In [8]:
import pandas as pd
import numpy as np
from sklearn.datasets import make_regression
from sklearn.preprocessing import MinMaxScaler

X, y = make_regression(n_samples=3000, n_features=3, n_informative=3, noise=5.0, random_state=42)

# 2. MAP MATH TO YOUR BUSINESS COLUMNS
# Sklearn gives us generic numbers (centered around 0). 
# We need to scale them to look like real data (Hours, Tasks, Absences).

# Scaler helps us force the sklearn numbers into specific ranges (e.g., 0 to 5 absences)
scaler_tasks = MinMaxScaler(feature_range=(20, 150)) # Tasks between 20 and 150
scaler_hours = MinMaxScaler(feature_range=(4, 12))   # Working hours between 4 and 12
scaler_absences = MinMaxScaler(feature_range=(0, 5)) # Absences between 0 and 5

# Create the specific columns from the generic X features
total_tasks = scaler_tasks.fit_transform(X[:, 0].reshape(-1, 1)).flatten().astype(int)
daily_hours = scaler_hours.fit_transform(X[:, 1].reshape(-1, 1)).flatten().astype(int)
weekly_absences = scaler_absences.fit_transform(X[:, 2].reshape(-1, 1)).flatten().astype(int)

# 3. CONSTRUCT THE 'TIME' COLUMNS (To match your schema)
# We know 'daily_hours', so we can fake login/logout times
login_time = np.random.randint(8, 11, size=3000)  # Everyone starts between 8am-10am
logout_time = login_time + daily_hours           # Logout is strictly Login + Hours

# 4. FIX THE TARGET (Productivity Score)
# Sklearn generated 'y' based on the math, but it might be -100 to +100.
# We shift it to look like a score of 0-100.
scaler_target = MinMaxScaler(feature_range=(30, 99))
productivity_score = scaler_target.fit_transform(y.reshape(-1, 1)).flatten().astype(int)

# 5. ASSEMBLE THE DATAFRAME
df = pd.DataFrame({
    'employee_id': np.arange(1, 3001),
    'login_time': login_time,
    'logout_time': logout_time,
    'total_tasks_completed': total_tasks,
    'weekly_absences': weekly_absences,
    'productivity_score': productivity_score
})

# 6. SAVE
df.to_csv('employee_productivity.csv', index=False)

print("✅ Generated 3000 rows using sklearn.make_regression")
print(df.head())

✅ Generated 3000 rows using sklearn.make_regression
   employee_id  login_time  logout_time  total_tasks_completed  \
0            1           9           16                     65   
1            2           9           17                     82   
2            3          10           17                     76   
3            4          10           17                    110   
4            5           8           14                     97   

   weekly_absences  productivity_score  
0                2                  56  
1                1                  54  
2                2                  62  
3                2                  74  
4                1                  58  


In [9]:
df.duplicated().sum()

np.int64(0)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 6 columns):
 #   Column                 Non-Null Count  Dtype
---  ------                 --------------  -----
 0   employee_id            3000 non-null   int64
 1   login_time             3000 non-null   int32
 2   logout_time            3000 non-null   int64
 3   total_tasks_completed  3000 non-null   int64
 4   weekly_absences        3000 non-null   int64
 5   productivity_score     3000 non-null   int64
dtypes: int32(1), int64(5)
memory usage: 129.0 KB
