In [None]:
!pip install scikit-learn

In [None]:
# Load pandas under its conventional alias
import pandas as pd

# Parse the employee CSV into a DataFrame
data = pd.read_csv('../Data/employee.csv')

# Preview the first five rows
data.head(5)

In [None]:
# Preview the final five rows
data.tail(5)

In [None]:
# Show the column labels of the DataFrame
column_labels = data.columns
print(column_labels)

In [None]:
# Show the DataFrame dimensions as a (rows, columns) tuple
n_rows, n_cols = data.shape
print((n_rows, n_cols))

In [None]:
# Print the summary report produced by DataFrame.info() for the loaded data
data.info()

In [None]:
# Compute and display descriptive statistics for the DataFrame
summary_stats = data.describe()
summary_stats

In [None]:
# Result is a DataFrame
# Keep only the listed columns via filter()
data.filter(items=['name', 'department'])

In [None]:
# Result is a Series
# Select the single column "name" with a plain string key
name_series = data['name']
name_series

In [None]:
# Result is a DataFrame, because the column name is passed inside a list
# Select the single column "name"
name_only = ['name']
data[name_only]

In [None]:
# Select the name and department columns together
wanted_columns = ['name', 'department']
data[wanted_columns]

In [None]:
# Keep the rows whose index labels are 0, 1 and 2
data.filter(items=[0, 1, 2], axis='index')

In [None]:
# Take rows at positions 2, 3 and 4 with a positional slice
data.iloc[2:5]

In [None]:
# Keep only the rows whose department equals 'Sales'
is_sales = data['department'] == 'Sales'
data[is_sales]

In [None]:
# Keep the rows whose department is any of the listed values
in_selected_departments = data['department'].isin(['Sales', 'Finance'])
data[in_selected_departments]

In [None]:
# Employees whose performance score is 700 or higher (the bound is inclusive)
is_high_score = data['performance_score'] >= 700
data[is_high_score]

In [None]:
# Employees whose performance score is at least 500 and below 700
at_least_500 = data['performance_score'] >= 500
below_700 = data['performance_score'] < 700
data[at_least_500 & below_700]

In [None]:
# Employees whose performance score is below 500, expressed as a query string
low_score_expr = 'performance_score<500'
data.query(low_score_expr)

In [None]:
# Drop missing value rows using dropna() function
# Read the data from ../Data, the location the notebook's first cell loads from
# (a bare 'employee.csv' only resolves if a second copy sits next to the notebook)
data = pd.read_csv('../Data/employee.csv')
# Remove every row that contains at least one missing value
data = data.dropna()
data

In [None]:
# Read the data from ../Data, the location the notebook's first cell loads from
# (a bare 'employee.csv' only resolves if a second copy sits next to the notebook)
data = pd.read_csv('../Data/employee.csv')

# Fill all the missing values in the age column with mean of the age column
data['age'] = data['age'].fillna(data['age'].mean())
data

In [None]:
# Replace missing income values with the median of the income column
income_median = data['income'].median()
data['income'] = data['income'].fillna(income_median)
data


In [None]:
# Replace missing gender values (a categorical column) with the first mode of the column
most_common_gender = data['gender'].mode()[0]
data['gender'] = data['gender'].fillna(most_common_gender)
data

In [None]:
# Read the data from ../Data, the location the notebook's first cell loads from
# (a bare 'employee.csv' only resolves if a second copy sits next to the notebook)
data = pd.read_csv('../Data/employee.csv')

# Dropping the outliers using Standard Deviation: keep the rows whose score
# lies strictly within three standard deviations of the mean.
# Mean and standard deviation are computed once and reused for both limits.
score_mean = data['performance_score'].mean()
score_std = data['performance_score'].std()
upper_limit = score_mean + 3 * score_std
lower_limit = score_mean - 3 * score_std
data = data[(data['performance_score'] < upper_limit) & (data['performance_score'] > lower_limit)]
data

In [None]:
# Read the data from ../Data, the location the notebook's first cell loads from
# (a bare 'employee.csv' only resolves if a second copy sits next to the notebook)
data = pd.read_csv('../Data/employee.csv')

# Drop the outlier observations using Percentiles: keep the rows whose score
# lies strictly between the 1st and the 99th percentile of the column.
upper_limit = data['performance_score'].quantile(.99)
lower_limit = data['performance_score'].quantile(.01)
data = data[(data['performance_score'] < upper_limit) & (data['performance_score'] > lower_limit)]
data

In [None]:
# Load a fresh copy of the employee data
data = pd.read_csv('../Data/employee.csv')

# Dummy-encode the gender column with get_dummies()
gender_column = data['gender']
encoded_data = pd.get_dummies(gender_column)

# Attach the encoded columns to the original DataFrame
data = data.join(encoded_data)

# Preview the first five rows of the combined DataFrame
data.head(5)

In [None]:
# Bring in scikit-learn's one-hot encoder
from sklearn.preprocessing import OneHotEncoder

data = pd.read_csv('../Data/employee.csv')

# Replace missing gender values (a categorical column) with the first mode of the gender column
gender_mode = data['gender'].mode()[0]
data['gender'] = data['gender'].fillna(gender_mode)
print(f"data: \n{data}")

# Create the encoder object
onehotencoder = OneHotEncoder()

# Fit on the gender column, transform it and convert the result with toarray()
onehotencoder.fit_transform(data[['gender']]).toarray()

In [None]:
# Imports for this cell: pandas and scikit-learn's LabelEncoder
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load the employee data
data = pd.read_csv('../Data/employee.csv')

# Create the encoder object
label_encoder = LabelEncoder()

# Fit the encoder on the department column and encode it in one step
department_labels = data['department']
encoded_data = label_encoder.fit_transform(department_labels)

# Show the encoded values
print(encoded_data)

In [None]:
# Convert a few encoded values back into the labels the encoder was fitted on
codes_to_decode = [0, 0, 1, 2]
inverse_encode = label_encoder.inverse_transform(codes_to_decode)
# Show the decoded labels
print(inverse_encode)

In [None]:
# Import pandas and OrdinalEncoder
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

# Load the data
data = pd.read_csv('../Data/employee.csv')

# Initialize OrdinalEncoder with an explicit order. `categories` takes one
# list per encoded feature, so the single grade ordering is wrapped in an
# outer list; positions in that list become the encoded values.
order_encoder = OrdinalEncoder(categories=[['G0', 'G1', 'G2', 'G3', 'G4']])

# Fit and transform the grade column with the ordinal encoder defined above
# (not a label encoder left over from another cell, which would ignore the
# requested order). Double brackets pass the column as 2-D input, and ravel()
# flattens the single-column result for assignment.
# NOTE(review): assumes every grade is one of G0-G4 with no missing values — confirm.
data['grade_encoded'] = order_encoder.fit_transform(data[['grade']]).ravel()

# Check top-5 records of the dataframe
data.head()

In [None]:
# Bring in StandardScaler (z-score normalization)
from sklearn.preprocessing import StandardScaler

# Create the scaler object
scaler = StandardScaler()

# Reshape the scores into a single-column 2-D array
score_column = data['performance_score'].values.reshape(-1, 1)

# Fit the scaler on the scores, then store the transformed scores as a new column
data['performance_std_scaler'] = scaler.fit(score_column).transform(score_column)
data.head(5)

In [None]:
# Bring in MinMaxScaler
from sklearn.preprocessing import MinMaxScaler

# Create the scaler object
scaler = MinMaxScaler()

# Reshape the scores into a single-column 2-D array
score_column = data['performance_score'].values.reshape(-1, 1)

# Fit the scaler on the scores, then store the transformed scores as a new column
data['performance_minmax_scaler'] = scaler.fit(score_column).transform(score_column)
data.head(5)

In [None]:
# Bring in RobustScaler
from sklearn.preprocessing import RobustScaler

# Create the scaler object
scaler = RobustScaler()

# Reshape the scores into a single-column 2-D array
score_column = data['performance_score'].values.reshape(-1, 1)

# Fit the scaler on the scores, then store the transformed scores as a new column
data['performance_robust_scaler'] = scaler.fit(score_column).transform(score_column)
# Preview the first five rows
data.head(5)

In [None]:
# Load a fresh copy of the employee data
data = pd.read_csv('../Data/employee.csv')
# Create performance grade function 
def performance_grade(score):
    """Map a performance score to a letter grade.

    Returns 'A' for a score of 700 or more, 'B' for a score from 500 up to
    (but not including) 700, and 'C' for every other value.
    """
    # Guard clauses: once the 'A' check has failed the score is already
    # below 700, so only the lower bound needs testing for 'B'.
    if score >= 700:
        return 'A'
    if score >= 500:
        return 'B'
    return 'C'
# Grade every row by applying performance_grade to the performance_score column
score_series = data['performance_score']
data['performance_grade'] = score_series.apply(performance_grade)
# Preview the first five rows
data.head(5)

In [None]:
# Split the name column in first and last name. The column is split once on
# spaces and the pieces are picked with the vectorised .str[] accessor, which
# yields NaN (instead of raising) for a missing name or a name that has no
# second word.
name_parts = data['name'].str.split(" ")
data['first_name'] = name_parts.str[0]
data['last_name'] = name_parts.str[1]
# Check top-5 records
data.head()