In [1]:
#importing the pandas and numpy libraries
import pandas as pd
import numpy as np

In [2]:
#loading the data in a variable "df"
df = pd.read_csv("Data.csv")
#using head() to briefly get to know the dataset's columns
df

Unnamed: 0,Index,Age,Salary,Rating,Location,Established,Easy Apply
0,0,44.0,$44k-$99k,5.4,"India,In",1999,TRUE
1,1,66.0,$55k-$66k,3.5,"New York,Ny",2002,TRUE
2,2,,$77k-$89k,-1.0,"New York,Ny",-1,-1
3,3,64.0,$44k-$99k,4.4,India In,1988,-1
4,4,25.0,$44k-$99k,6.4,Australia Aus,2002,-1
5,5,44.0,$77k-$89k,1.4,"India,In",1999,TRUE
6,6,21.0,$44k-$99k,0.0,"New York,Ny",-1,-1
7,7,44.0,$44k-$99k,-1.0,Australia Aus,-1,-1
8,8,35.0,$44k-$99k,5.4,"New York,Ny",-1,-1
9,9,22.0,$44k-$99k,7.7,"India,In",-1,TRUE


In [3]:
df.columns = df.columns.str.lower().str.replace(' ','_')

In [4]:
#using .shape to see the number of rows and columns in the data
df.shape

(29, 7)

In [5]:
#using .index to know the start, end and step of the index of the data
df.index

RangeIndex(start=0, stop=29, step=1)

In [6]:
#using .columns to print all the column names in the data
df.columns

Index(['index', 'age', 'salary', 'rating', 'location', 'established',
       'easy_apply'],
      dtype='object')

In [7]:
#using .dtypes to know the datatype of each column in the data
df.dtypes

index            int64
age            float64
salary          object
rating         float64
location        object
established      int64
easy_apply      object
dtype: object

In [8]:
#using .nunique() to know the number of unique values in each column
df.nunique()

index          29
age            12
salary          7
rating         19
location        4
established    19
easy_apply      2
dtype: int64

In [9]:
#using .count() to know the number of values in each column
df.count()

index          29
age            22
salary         29
rating         28
location       29
established    29
easy_apply     29
dtype: int64

In [10]:
#.info() provides us woth the basic info about the data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29 entries, 0 to 28
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   index        29 non-null     int64  
 1   age          22 non-null     float64
 2   salary       29 non-null     object 
 3   rating       28 non-null     float64
 4   location     29 non-null     object 
 5   established  29 non-null     int64  
 6   easy_apply   29 non-null     object 
dtypes: float64(2), int64(2), object(3)
memory usage: 1.7+ KB


## Missing Values

##### Are there any missing values in the dataset, and if so, how should they be handled for each indicator? 

In [11]:
#.isnull tells us if the value is null or not in that cell(true=null,false=not null)
print(df.isnull())

    index    age  salary  rating  location  established  easy_apply
0   False  False   False   False     False        False       False
1   False  False   False   False     False        False       False
2   False   True   False   False     False        False       False
3   False  False   False   False     False        False       False
4   False  False   False   False     False        False       False
5   False  False   False   False     False        False       False
6   False  False   False   False     False        False       False
7   False  False   False   False     False        False       False
8   False  False   False   False     False        False       False
9   False  False   False   False     False        False       False
10  False  False   False   False     False        False       False
11  False  False   False   False     False        False       False
12  False   True   False   False     False        False       False
13  False  False   False   False     False      

In [12]:
print(df[df.isnull().any(axis=1)])

    index  age     salary  rating       location  established easy_apply
2       2  NaN  $77k-$89k    -1.0    New York,Ny           -1         -1
12     12  NaN  $44k-$99k     0.0       India,In         1999         -1
17     17  NaN  $44k-$99k     5.3    New York,Ny         1943       TRUE
20     20  NaN  $44k-$99k     5.7    New York,Ny         1944       TRUE
23     23  NaN  $44k-$99k     2.4    New York,Ny         1999       TRUE
26     26  NaN  $55k-$66k     NaN       India,In         1934       TRUE
28     28  NaN  $39k-$88k     3.4  Australia Aus         1932         -1


In [13]:
# here iam using .fillna to fill the Age column missing values with their
# median
df['age'].fillna(round(df['age'].median()), inplace=True)
#here i am replacing the null rating with 0
df['rating'].fillna((0), inplace=True)
#converting it to int to not the decimal age cause it does not make sense
df['age'] = df['age'].astype(int)

##### there are 7 missing values in age column, if the dataset was large it would be a good idea to remove the rows but here the data is small so we will fill in the median values in place of the missing ones 

## Data Types

##### What are the data types of each indicator, and do they align with their expected types (e.g., numerical, categorical)?

In [14]:
df.head(3)

Unnamed: 0,index,age,salary,rating,location,established,easy_apply
0,0,44,$44k-$99k,5.4,"India,In",1999,TRUE
1,1,66,$55k-$66k,3.5,"New York,Ny",2002,TRUE
2,2,40,$77k-$89k,-1.0,"New York,Ny",-1,-1


In [15]:
#The data type of each indicator
df.dtypes

index            int64
age              int32
salary          object
rating         float64
location        object
established      int64
easy_apply      object
dtype: object

## Outliers

##### Identify potential outliers in numerical indicators (e.g., Age, Salary, Rating). Should outliers be removed or adjusted?

In [16]:
Q1 = df['age'].quantile(0.25)
Q3 = df['age'].quantile(0.75)
IQR = Q3 - Q1
outliers = df[(df['age'] < (Q1 - 1.5 * IQR)) | (df['age'] > (Q3 + 1.5 * IQR))]
outliers

Unnamed: 0,index,age,salary,rating,location,established,easy_apply
1,1,66,$55k-$66k,3.5,"New York,Ny",2002,TRUE
3,3,64,$44k-$99k,4.4,India In,1988,-1
14,14,66,$44k-$99k,4.0,Australia Aus,2020,TRUE
24,24,13,$44k-$99k,-1.0,"New York,Ny",1987,-1


## Formating Salary

##### Formatting the numerical for a better analysis

In [17]:
df['salary'] = df['salary'].str.replace('$',' ')
df['salary'] = df['salary'].str.replace('k', '000')

  df['salary'] = df['salary'].str.replace('$',' ')


## Location Standardization

In [18]:
df['location'] = df['location'].str.replace('India,In','India')
df['location'] = df['location'].str.replace('India In','India')
df['location'] = df['location'].str.replace('New York,Ny','NewYork')
df['location'] = df['location'].str.replace('Australia Aus','Australia')

## Established Column

In [20]:
df['established'] = df['established'].replace(-1,'unknown')

## Easy Apply Indicator

In [21]:
df['easy_apply'] = df['easy_apply'].map({'TRUE': True, '-1': False})

## Rating Range

In [26]:
df['rating'] = df['rating'].replace(-1,0)

## Age Distribution

In [27]:
df

Unnamed: 0,index,age,salary,rating,location,established,easy_apply
0,0,44,44000- 99000,5.4,India,1999,True
1,1,66,55000- 66000,3.5,NewYork,2002,True
2,2,40,77000- 89000,0.0,NewYork,unknown,False
3,3,64,44000- 99000,4.4,India,1988,False
4,4,25,44000- 99000,6.4,Australia,2002,False
5,5,44,77000- 89000,1.4,India,1999,True
6,6,21,44000- 99000,0.0,NewYork,unknown,False
7,7,44,44000- 99000,0.0,Australia,unknown,False
8,8,35,44000- 99000,5.4,NewYork,unknown,False
9,9,22,44000- 99000,7.7,India,unknown,True


##### There are 4 outliers in age, they can impact the average age.