Pandas is a Python library used for:

Working with tabular data (like Excel sheets or SQL tables)
Data cleaning, transformation, and analysis
Making your data ready for machine learning or visualization
Think of pandas as Excel in Python — but faster, more powerful, and programmable.

Series:

A Series is basically a single column of data with labels (called an index).

Think of it like a list in Python, but smarter. It has:

1 - Values (like numbers, names, etc.)
2 - Indexes (labels or row numbers)

Imagine an Excel column labeled "Age" with values like this:

Index	Age
0	    25
1	    30
2	    35
This column(Age) is a Series.


In [3]:
# Series

import pandas as pd

data = [1,2,3,4,5]
s = pd.Series(data)
print("Series \n", s)
print(type(s))

Series 
 0    1
1    2
2    3
3    4
4    5
dtype: int64
<class 'pandas.core.series.Series'>


In [22]:
# Create Series from dictionary

data_dict = {'a': 1, 'b':2, 'c': 3}
s_dict = pd.Series(data_dict)
print("Series: \n", s_dict)

Series: 
 a    1
b    2
c    3
dtype: int64


In [23]:
#Create custom index series

data = [1,2,3,4,5]
index = ['New York', 'Los Angeles', 'Chicago', 'Dallas', 'Seattle']
s = pd.Series(data, index = index)
print("Series:\n", s)

Series:
 New York       1
Los Angeles    2
Chicago        3
Dallas         4
Seattle        5
dtype: int64


In [None]:
# Create dataframe from dictionary

import pandas as pd

data = {
    "Name:" : ["David", "Jack", "John", "Mary"],
    "Age:" : [23, 35, 32, 27],
    "City:" : ["New York", "Los Angeles", "Chicago", "Houston"]
}
df = pd.DataFrame(data)

print("data:", df)

data:    Name:  Age:        City:
0  David    23     New York
1   Jack    35  Los Angeles
2   John    32      Chicago
3   Mary    27      Houston


In [26]:
# Covert Dataframe to Numpy array

import pandas as pd

data = {
    "Name:" : ["David", "Jack", "John", "Mary"],
    "Age:" : [23, 35, 32, 27],
    "City:" : ["New York", "Los Angeles", "Chicago", "Houston"]
}
df = pd.DataFrame(data)

import numpy as np
array = df.to_numpy()
print(array)

[['David' 23 'New York']
 ['Jack' 35 'Los Angeles']
 ['John' 32 'Chicago']
 ['Mary' 27 'Houston']]


In [27]:
# Create dataframe from list of dictionary

data = [
    {'Name:' : 'David', 'Age': 23, 'City': 'New York'},
    {'Name:' : 'Jack', 'Age': 35, 'City': 'Los Angeles'},
    {'Name:' : 'John', 'Age': 32, 'City': 'Chicago'},
    {'Name:' : 'Mary', 'Age': 27, 'City': 'Houston'}
]

df = pd.DataFrame(data)
print("DataFrame:\n", df)

DataFrame:
    Name:  Age         City
0  David   23     New York
1   Jack   35  Los Angeles
2   John   32      Chicago
3   Mary   27      Houston


In [28]:
# df.loc
#.loc is used to:
# Access rows and columns by label (not by number/index position)
# Select a specific row, range of rows, or specific cell
# Filter or update values based on conditions

data = {
    "Age": [25, 30, 35],
    "City": ["New York", "San Francisco", "Los Angeles"]
}

df = pd.DataFrame(data, index=["Alice", "Bob", "Charlie"])
print(df)

df.loc["Alice"]

         Age           City
Alice     25       New York
Bob       30  San Francisco
Charlie   35    Los Angeles


Age           25
City    New York
Name: Alice, dtype: object

In [4]:
#Read data from CSV/Excel file

data = pd.read_csv("/Users/goley/Documents/Learn/ML, AI/Files/personality_dataset.csv")
print(data.info)
print(data.columns)

<bound method DataFrame.info of       Time_spent_Alone Stage_fear  Social_event_attendance  Going_outside  \
0                  4.0         No                      4.0            6.0   
1                  9.0        Yes                      0.0            0.0   
2                  9.0        Yes                      1.0            2.0   
3                  0.0         No                      6.0            7.0   
4                  3.0         No                      9.0            4.0   
...                ...        ...                      ...            ...   
2895               3.0         No                      7.0            6.0   
2896               3.0         No                      8.0            3.0   
2897               4.0        Yes                      1.0            1.0   
2898              11.0        Yes                      1.0            NaN   
2899               3.0         No                      6.0            6.0   

     Drained_after_socializing  Friends_cir

In [9]:
print(data.head())
print(data.head(10))
print(data.tail())

   Time_spent_Alone Stage_fear  Social_event_attendance  Going_outside  \
0               4.0         No                      4.0            6.0   
1               9.0        Yes                      0.0            0.0   
2               9.0        Yes                      1.0            2.0   
3               0.0         No                      6.0            7.0   
4               3.0         No                      9.0            4.0   

  Drained_after_socializing  Friends_circle_size  Post_frequency Personality  
0                        No                 13.0             5.0   Extrovert  
1                       Yes                  0.0             3.0   Introvert  
2                       Yes                  5.0             2.0   Introvert  
3                        No                 14.0             8.0   Extrovert  
4                        No                  8.0             5.0   Extrovert  
   Time_spent_Alone Stage_fear  Social_event_attendance  Going_outside  \
0      

In [11]:
print(data.shape) #rows, columns



(2900, 8)


In [12]:
print(data.describe()) #summary statistics


       Time_spent_Alone  Social_event_attendance  Going_outside  \
count       2837.000000              2838.000000    2834.000000   
mean           4.505816                 3.963354       3.000000   
std            3.479192                 2.903827       2.247327   
min            0.000000                 0.000000       0.000000   
25%            2.000000                 2.000000       1.000000   
50%            4.000000                 3.000000       3.000000   
75%            8.000000                 6.000000       5.000000   
max           11.000000                10.000000       7.000000   

       Friends_circle_size  Post_frequency  
count          2823.000000     2835.000000  
mean              6.268863        3.564727  
std               4.289693        2.926582  
min               0.000000        0.000000  
25%               3.000000        1.000000  
50%               5.000000        3.000000  
75%              10.000000        6.000000  
max              15.000000       10.

In [13]:
print(data.columns) #column names


Index(['Time_spent_Alone', 'Stage_fear', 'Social_event_attendance',
       'Going_outside', 'Drained_after_socializing', 'Friends_circle_size',
       'Post_frequency', 'Personality'],
      dtype='object')


In [None]:
# df.dtypes → What kind of data is in each column?
# This tells you the type of data in every column.

print(data.dtypes) #data types of each column


Time_spent_Alone             float64
Stage_fear                    object
Social_event_attendance      float64
Going_outside                float64
Drained_after_socializing     object
Friends_circle_size          float64
Post_frequency               float64
Personality                   object
dtype: object


In [None]:
# df.info() → Quick summary of the DataFrame

# Total number of rows and columns
# Names and data types of each column
# How many non-missing values are in each column
# How much memory the whole table is using

print(data.info()) #summary of the DataFrame

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2900 entries, 0 to 2899
Data columns (total 8 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Time_spent_Alone           2837 non-null   float64
 1   Stage_fear                 2827 non-null   object 
 2   Social_event_attendance    2838 non-null   float64
 3   Going_outside              2834 non-null   float64
 4   Drained_after_socializing  2848 non-null   object 
 5   Friends_circle_size        2823 non-null   float64
 6   Post_frequency             2835 non-null   float64
 7   Personality                2900 non-null   object 
dtypes: float64(5), object(3)
memory usage: 181.4+ KB
None


In [None]:
print(data.isnull().any())

Time_spent_Alone              True
Stage_fear                    True
Social_event_attendance       True
Going_outside                 True
Drained_after_socializing     True
Friends_circle_size           True
Post_frequency                True
Personality                  False
dtype: bool


In [8]:
print(data.isnull().sum())

Time_spent_Alone             63
Stage_fear                   73
Social_event_attendance      62
Going_outside                66
Drained_after_socializing    52
Friends_circle_size          77
Post_frequency               65
Personality                   0
dtype: int64


In [12]:
data_updated = data.rename(columns={'Personality': 'Personality Type'})
print(data_updated.head())

   Time_spent_Alone Stage_fear  Social_event_attendance  Going_outside  \
0               4.0         No                      4.0            6.0   
1               9.0        Yes                      0.0            0.0   
2               9.0        Yes                      1.0            2.0   
3               0.0         No                      6.0            7.0   
4               3.0         No                      9.0            4.0   

  Drained_after_socializing  Friends_circle_size  Post_frequency  \
0                        No                 13.0             5.0   
1                       Yes                  0.0             3.0   
2                       Yes                  5.0             2.0   
3                        No                 14.0             8.0   
4                        No                  8.0             5.0   

  Personality Type  
0        Extrovert  
1        Introvert  
2        Introvert  
3        Extrovert  
4        Extrovert  


In [13]:
print(data.columns)

Index(['Time_spent_Alone', 'Stage_fear', 'Social_event_attendance',
       'Going_outside', 'Drained_after_socializing', 'Friends_circle_size',
       'Post_frequency', 'Personality'],
      dtype='object')


In [18]:
data_updated['New_Friends_circle_size'] = data_updated['Friends_circle_size'].apply(lambda x: x + (x %10))
data_updated.head()

Unnamed: 0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality Type,New_Friends_circle_size
0,4.0,No,4.0,6.0,No,13.0,5.0,Extrovert,16.0
1,9.0,Yes,0.0,0.0,Yes,0.0,3.0,Introvert,0.0
2,9.0,Yes,1.0,2.0,Yes,5.0,2.0,Introvert,10.0
3,0.0,No,6.0,7.0,No,14.0,8.0,Extrovert,18.0
4,3.0,No,9.0,4.0,No,8.0,5.0,Extrovert,16.0


In [None]:
# Data Aggregation and Grouping
# Grouping data by a specific column and calculating statistics

grouped_mean = data_updated.groupby('Personality Type')['New_Friends_circle_size'].mean()
print(grouped_mean)

Personality Type
Extrovert    13.543763
Introvert     5.978134
Name: New_Friends_circle_size, dtype: float64


In [28]:
print(data_updated.columns.tolist())

['Time_spent_Alone', 'Stage_fear', 'Social_event_attendance', 'Going_outside', 'Drained_after_socializing', 'Friends_circle_size', 'Post_frequency', 'Personality Type', 'New_Friends_circle_size']


In [39]:
# Groupby multiple columns and calculate statistics

grouped_data = data_updated.groupby('Personality Type')['New_Friends_circle_size'].mean()
print(grouped_data)

Personality Type
Extrovert    13.543763
Introvert     5.978134
Name: New_Friends_circle_size, dtype: float64
