In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Configuración para gráficos
%matplotlib inline
sns.set(style="whitegrid")

# Cargar el dataset
df = pd.read_csv('Salary Data.csv')

# Ver las primeras filas para confirmar que se cargó bien
df.head()

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary
0,32.0,Male,Bachelor's,Software Engineer,5.0,90000.0
1,28.0,Female,Master's,Data Analyst,3.0,65000.0
2,45.0,Male,PhD,Senior Manager,15.0,150000.0
3,36.0,Female,Bachelor's,Sales Associate,7.0,60000.0
4,52.0,Male,Master's,Director,20.0,200000.0


In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 375 entries, 0 to 374
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Age                  373 non-null    float64
 1   Gender               373 non-null    object 
 2   Education Level      373 non-null    object 
 3   Job Title            373 non-null    object 
 4   Years of Experience  373 non-null    float64
 5   Salary               373 non-null    float64
dtypes: float64(3), object(3)
memory usage: 17.7+ KB


In [3]:
# Contar los valores únicos en cada columna categórica
for col in ('Gender', 'Education Level', 'Job Title'):
    print(f'en la columna {col} hay {df[col].nunique()} valores únicos.')

en la columna Gender hay 2 valores únicos.
en la columna Education Level hay 3 valores únicos.
en la columna Job Title hay 174 valores únicos.


In [4]:
# Seleccionar columnas
x  = df.drop(['Gender', 'Job Title'], axis = 1)
x.head()

Unnamed: 0,Age,Education Level,Years of Experience,Salary
0,32.0,Bachelor's,5.0,90000.0
1,28.0,Master's,3.0,65000.0
2,45.0,PhD,15.0,150000.0
3,36.0,Bachelor's,7.0,60000.0
4,52.0,Master's,20.0,200000.0


In [5]:
# Seleccionar filas y columnas en Pandas mediante posición con iloc
print(df.iloc[0]) # Primera fila
df.iloc[1] # Segunda fila
dfa = df.iloc[-1] # Última fila
dfa.head()

Age                                 32.0
Gender                              Male
Education Level               Bachelor's
Job Title              Software Engineer
Years of Experience                  5.0
Salary                           90000.0
Name: 0, dtype: object


Age                                       44.0
Gender                                  Female
Education Level                            PhD
Job Title              Senior Business Analyst
Years of Experience                       15.0
Name: 374, dtype: object

In [6]:
print(df.head())
df.iloc[0:5] # Primeras cinco filas
df.iloc[:, 0:5] # Primeras cinco columnas
df.iloc[[0,2,1]]  # Primera, tercera y segunda filas
df.iloc[:, [0,2,1]]  # Primera, tercera y segunda columnas

    Age  Gender Education Level          Job Title  Years of Experience  \
0  32.0    Male      Bachelor's  Software Engineer                  5.0   
1  28.0  Female        Master's       Data Analyst                  3.0   
2  45.0    Male             PhD     Senior Manager                 15.0   
3  36.0  Female      Bachelor's    Sales Associate                  7.0   
4  52.0    Male        Master's           Director                 20.0   

     Salary  
0   90000.0  
1   65000.0  
2  150000.0  
3   60000.0  
4  200000.0  


Unnamed: 0,Age,Education Level,Gender
0,32.0,Bachelor's,Male
1,28.0,Master's,Female
2,45.0,PhD,Male
3,36.0,Bachelor's,Female
4,52.0,Master's,Male
...,...,...,...
370,35.0,Bachelor's,Female
371,43.0,Master's,Male
372,29.0,Bachelor's,Female
373,34.0,Bachelor's,Male


In [7]:
# Seleccionar filas y columnas en Pandas en base a etiquetas con loc
df.head()
df_sub = df.loc[1:4]
df_sub

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary
1,28.0,Female,Master's,Data Analyst,3.0,65000.0
2,45.0,Male,PhD,Senior Manager,15.0,150000.0
3,36.0,Female,Bachelor's,Sales Associate,7.0,60000.0
4,52.0,Male,Master's,Director,20.0,200000.0


In [8]:
print(df_sub.loc[1])
print('')
print(df_sub.iloc[1])

Age                            28.0
Gender                       Female
Education Level            Master's
Job Title              Data Analyst
Years of Experience             3.0
Salary                      65000.0
Name: 1, dtype: object

Age                              45.0
Gender                           Male
Education Level                   PhD
Job Title              Senior Manager
Years of Experience              15.0
Salary                       150000.0
Name: 2, dtype: object


In [9]:
# Seleccionar Columnas
df.loc[:, 'Gender'] # Columna Gender

df.loc[:, ['Gender', 'Salary']] #re Columnas Gender, Salary

Unnamed: 0,Gender,Salary
0,Male,90000.0
1,Female,65000.0
2,Male,150000.0
3,Female,60000.0
4,Male,200000.0
...,...,...
370,Female,85000.0
371,Male,170000.0
372,Female,40000.0
373,Male,90000.0


In [10]:
# Seleccionar filas o columnas en base a una condición con loc
is_male = df.loc[:, 'Gender'] == 'Male'
df_male = df.loc[is_male]
df_male.head()

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary
0,32.0,Male,Bachelor's,Software Engineer,5.0,90000.0
2,45.0,Male,PhD,Senior Manager,15.0,150000.0
4,52.0,Male,Master's,Director,20.0,200000.0
5,29.0,Male,Bachelor's,Marketing Analyst,2.0,55000.0
7,31.0,Male,Bachelor's,Sales Manager,4.0,80000.0


In [11]:
# Problemas en los datos
# Información general del DataFrame
print("Información general del DataFrame:")
print(df.info())

Información general del DataFrame:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 375 entries, 0 to 374
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Age                  373 non-null    float64
 1   Gender               373 non-null    object 
 2   Education Level      373 non-null    object 
 3   Job Title            373 non-null    object 
 4   Years of Experience  373 non-null    float64
 5   Salary               373 non-null    float64
dtypes: float64(3), object(3)
memory usage: 17.7+ KB
None


In [12]:
# Verificar valores faltantes
print("\nValores faltantes (NaN) por columna:")
print(df.isnull().sum())


Valores faltantes (NaN) por columna:
Age                    2
Gender                 2
Education Level        2
Job Title              2
Years of Experience    2
Salary                 2
dtype: int64


In [13]:
DFN = df.fillna(0)


print(DFN.isnull().sum())

Age                    0
Gender                 0
Education Level        0
Job Title              0
Years of Experience    0
Salary                 0
dtype: int64


In [14]:
col = 'Gender'

moda = df[col].mode()[0]

print(moda)
df[col] = df[col].fillna(moda)
print(df.isnull().sum())

Male
Age                    2
Gender                 0
Education Level        2
Job Title              2
Years of Experience    2
Salary                 2
dtype: int64


In [16]:
# Estadisticas descriptivas
print("\nEstadísticas descriptivas:")
print(df.describe())


Estadísticas descriptivas:
              Age  Years of Experience         Salary
count  373.000000           373.000000     373.000000
mean    37.431635            10.030831  100577.345845
std      7.069073             6.557007   48240.013482
min     23.000000             0.000000     350.000000
25%     31.000000             4.000000   55000.000000
50%     36.000000             9.000000   95000.000000
75%     44.000000            15.000000  140000.000000
max     53.000000            25.000000  250000.000000


In [17]:
# Verificar tipos de datos únicos en columnas categóricas

print("\nValores únicos en columnas categóricas:")
for col in ['Gender', 'Education Level']:#, 'Job Title']:
    print(f"\n{col}: {df[col].unique()}")


Valores únicos en columnas categóricas:

Gender: ['Male' 'Female']

Education Level: ["Bachelor's" "Master's" 'PhD']


In [18]:
# Verificar duplicados
print(f"\nNúmero de filas duplicadas: {df.duplicated().sum()}")


Número de filas duplicadas: 50


In [19]:
df.duplicated()

0      False
1      False
2      False
3      False
4      False
       ...  
370     True
371    False
372     True
373     True
374     True
Length: 375, dtype: bool

In [20]:
dato_370 = df.iloc[369] 
print(dato_370)

Age                                       33.0
Gender                                    Male
Education Level                     Bachelor's
Job Title              Junior Business Analyst
Years of Experience                        4.0
Salary                                 60000.0
Name: 369, dtype: object


In [21]:
df_new = df.drop_duplicates()
df_new

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary
0,32.0,Male,Bachelor's,Software Engineer,5.0,90000.0
1,28.0,Female,Master's,Data Analyst,3.0,65000.0
2,45.0,Male,PhD,Senior Manager,15.0,150000.0
3,36.0,Female,Bachelor's,Sales Associate,7.0,60000.0
4,52.0,Male,Master's,Director,20.0,200000.0
...,...,...,...,...,...,...
348,28.0,Female,Bachelor's,Junior Operations Manager,1.0,35000.0
349,36.0,Male,Bachelor's,Senior Business Development Manager,8.0,110000.0
350,44.0,Female,PhD,Senior Data Scientist,16.0,160000.0
351,31.0,Male,Bachelor's,Junior Marketing Coordinator,3.0,55000.0


In [None]:
# Dummies variables