In [15]:
import pandas as pd
import re

In [82]:
zodiac = pd.read_csv("survey responses.csv")
zodiac

Unnamed: 0,birthday,zodiac sign,element,age range,sex,personality traits,occupation
0,November 4,Scorpio,water,40-49,Man,"curious, trustworthy, passionate, proactive",Journalist
1,May 7,Taurus,earth,20-29,Woman,"ambitious, imaginative, practical, sensitive",Journalist
2,August 7,Leo,fire,40-49,Woman,"ambitious, spontaneous, practical, humanitarian",Journalist
3,March 11,Pisces,water,20-29,Man,"ambitious, curious, spontaneous, sensitive",Journalist
4,May 6,Taurus,earth,50-59,Woman,"ambitious, loyal, passionate, proactive","STEM-related field (computer science, engineer, doctor, researcher, etc.)"
...,...,...,...,...,...,...,...
98,March 19,Pisces,water,17-19,Woman,"optimist, imaginative, intuitive, sensitive","Student, STEM-related field (computer science, engineer, doctor, researcher, etc.)"
99,March 5,Pisces,water,50-59,Woman,"spontaneous, passionate, intuitive, spiritual",Homemaker
100,Aug 6,Leo,fire,20-29,Woman,"trustworthy, spontaneous, practical, sensitive","STEM-related field (computer science, engineer, doctor, researcher, etc.)"
101,June 13,Gemini,air,20-29,Man,"trustworthy, optimist, reserved, patient","Student, STEM-related field (computer science, engineer, doctor, researcher, etc.)"


In [83]:
df = pd.DataFrame(zodiac)

In [84]:
# Check for duplicate rows in the DataFrame
duplicates = df[df.duplicated(keep=False)]

print("Duplicate rows:")
print(duplicates)

Duplicate rows:
Empty DataFrame
Columns: [birthday, zodiac sign, element, age range, sex, personality traits, occupation]
Index: []


In [85]:
#cleaning white spaces
# Clean the 'zodiac sign' column by stripping whitespaces and converting to lowercase
df['zodiac sign'] = df['zodiac sign'].str.strip().str.lower()
df['zodiac sign']

0      scorpio
1       taurus
2          leo
3       pisces
4       taurus
        ...   
98      pisces
99      pisces
100        leo
101     gemini
102      virgo
Name: zodiac sign, Length: 103, dtype: object

In [86]:
# Print unique values in the 'zodiac sign' column along with their lengths
unique_zodiac_signs = df['zodiac sign'].unique()
for sign in unique_zodiac_signs:
    print(f"{sign}: {len(sign)}")

scorpio: 7
taurus: 6
leo: 3
pisces: 6
capricorn: 9
libra: 5
sagittarius: 11
gemini: 6
cancer: 6
aries: 5
virgo: 5
aquarius: 8


In [88]:
# Replace non-breaking spaces with regular spaces
df['zodiac sign'] = df['zodiac sign'].str.replace('\xa0', ' ')
df['zodiac sign']

0      scorpio
1       taurus
2          leo
3       pisces
4       taurus
        ...   
98      pisces
99      pisces
100        leo
101     gemini
102      virgo
Name: zodiac sign, Length: 103, dtype: object

In [89]:
# Count the occurrences of each zodiac sign
zodiac_counts = df['zodiac sign'].value_counts()
zodiac_counts

zodiac sign
leo            14
libra          12
scorpio        11
virgo          11
pisces         10
capricorn       8
gemini          8
taurus          7
cancer          7
sagittarius     6
aquarius        5
aries           4
Name: count, dtype: int64

In [90]:
age_counts = df['age range'].value_counts()
age_counts

age range
20-29    61
50-59    16
40-49     9
30-39     8
60-69     5
17-19     3
70-79     1
Name: count, dtype: int64

In [91]:
element_counts = df['element'].value_counts()
element_counts

element
water    28
earth    26
air      25
fire     24
Name: count, dtype: int64

0          curious, trustworthy, passionate, proactive
1         ambitious, imaginative, practical, sensitive
2      ambitious, spontaneous, practical, humanitarian
3           ambitious, curious, spontaneous, sensitive
4              ambitious, loyal, passionate, proactive
                            ...                       
98         optimist, imaginative, intuitive, sensitive
99       spontaneous, passionate, intuitive, spiritual
100     trustworthy, spontaneous, practical, sensitive
101           trustworthy, optimist, reserved, patient
102          ambitious, loyal, practical, humanitarian
Name: personality traits, Length: 103, dtype: object

In [92]:
# Group the data by zodiac sign
grouped_by_sign = df.groupby('zodiac sign')

# Dictionary to store the most common personality trait for each sign
most_common_traits = {}

# Loop through each group and find the most common personality trait
for sign, group in grouped_by_sign:
    # Flatten the personality traits for each group into a single list
    traits_list = [trait for traits in group['personality traits'] for trait in traits.split(', ')]
    # Count occurrences of each trait
    trait_counts = pd.Series(traits_list).value_counts()
    # Get the most common trait
    most_common_trait = trait_counts.index[0]
    most_common_traits[sign] = most_common_trait

# Print the results
for sign, trait in most_common_traits.items():
    print(f"zodiac sign: {sign}, most common personality trait: {trait}")

zodiac sign: aquarius, most common personality trait: proactive
zodiac sign: aries, most common personality trait: loyal
zodiac sign: cancer, most common personality trait: loyal
zodiac sign: capricorn, most common personality trait: practical
zodiac sign: gemini, most common personality trait: curious
zodiac sign: leo, most common personality trait: loyal
zodiac sign: libra, most common personality trait: trustworthy
zodiac sign: pisces, most common personality trait: imaginative
zodiac sign: sagittarius, most common personality trait: proactive
zodiac sign: scorpio, most common personality trait: curious
zodiac sign: taurus, most common personality trait: ambitious
zodiac sign: virgo, most common personality trait: curious


In [93]:
# Group the data by age range
grouped_by_age = df.groupby('age range')

# Dictionary to store the most common personality trait for each age range
most_common_traits_by_age = {}

# Loop through each group and find the most common personality trait
for age_range, group in grouped_by_age:
    # Flatten the personality traits for each group into a single list
    traits_list = [trait for traits in group['personality traits'] for trait in traits.split(', ')]
    # Count occurrences of each trait
    trait_counts = pd.Series(traits_list).value_counts()
    # Get the most common trait
    most_common_trait = trait_counts.index[0]
    most_common_traits_by_age[age_range] = most_common_trait

# Print the results
for age_range, trait in most_common_traits_by_age.items():
    print(f"age range: {age_range}, most common personality trait: {trait}")

age range: 17-19, most common personality trait: trustworthy
age range: 20-29, most common personality trait: loyal
age range: 30-39, most common personality trait: imaginative
age range: 40-49, most common personality trait: trustworthy
age range: 50-59, most common personality trait: trustworthy
age range: 60-69, most common personality trait: passionate
age range: 70-79, most common personality trait: ambitious


In [94]:
# what does your zodiac sign have to do with your profession? 
occupation_counts = df['occupation'].value_counts()
occupation_counts

occupation
STEM-related field (computer science, engineer, doctor, researcher, etc.)                                                                           36
Journalist                                                                                                                                          15
Student                                                                                                                                             10
Student, STEM-related field (computer science, engineer, doctor, researcher, etc.)                                                                   8
Communications                                                                                                                                       4
Creative (graphics person, visual artist, musician, etc.)                                                                                            4
Homemaker                                                                          

In [107]:
from prettytable import PrettyTable

# Group the data by zodiac sign
grouped_by_sign = df.groupby('zodiac sign')

# Dictionary to store the most common profession and count for each zodiac sign
most_common_professions = {}

# Loop through each group and find the most common profession
for sign, group in grouped_by_sign:
    # Count occurrences of each profession
    profession_counts = group['occupation'].value_counts()
    # Get the most common profession and its count
    most_common_profession = profession_counts.index[0]
    most_common_count = profession_counts.iloc[0]  # Count of the most common profession
    most_common_professions[sign] = (most_common_profession, most_common_count)

# Create a PrettyTable instance
table = PrettyTable()
table.field_names = ["Zodiac Sign", "Most Common Profession", "Count"]

# Add data to the table
for sign, (profession, count) in most_common_professions.items():
    table.add_row([sign, profession, count])

# Print the table
print(table)

+-------------+---------------------------------------------------------------------------+-------+
| Zodiac Sign |                           Most Common Profession                          | Count |
+-------------+---------------------------------------------------------------------------+-------+
|   aquarius  |                               Communications                              |   2   |
|    aries    | STEM-related field (computer science, engineer, doctor, researcher, etc.) |   1   |
|    cancer   | STEM-related field (computer science, engineer, doctor, researcher, etc.) |   2   |
|  capricorn  | STEM-related field (computer science, engineer, doctor, researcher, etc.) |   2   |
|    gemini   | STEM-related field (computer science, engineer, doctor, researcher, etc.) |   4   |
|     leo     | STEM-related field (computer science, engineer, doctor, researcher, etc.) |   7   |
|    libra    | STEM-related field (computer science, engineer, doctor, researcher, etc.) |   3   |


In [96]:
# Group the data by gender
grouped_by_gender = df.groupby('sex')

# Dictionary to store the most common personality trait for each gender
most_common_traits_by_gender = {}

# Loop through each group and find the most common personality trait
for gender, group in grouped_by_gender:
    # Flatten the personality traits for each group into a single list
    traits_list = [trait for traits in group['personality traits'] for trait in traits.split(', ')]
    # Count occurrences of each trait
    trait_counts = pd.Series(traits_list).value_counts()
    # Get the most common trait
    most_common_trait = trait_counts.index[0]
    most_common_traits_by_gender[gender] = most_common_trait

# Print the results
for gender, trait in most_common_traits_by_gender.items():
    print(f"gender: {gender}, most common personality trait: {trait}")

gender: Man, most common personality trait: trustworthy
gender: Prefer not to say, most common personality trait: reserved
gender: Woman, most common personality trait: loyal


In [97]:
# Group the data by gender, zodiac sign, and age range
grouped_by_attributes = df.groupby(['sex', 'zodiac sign', 'age range'])

# Dictionary to store the most common personality trait for each combination
most_common_traits_by_attributes = {}

# Loop through each group and find the most common personality trait
for attributes, group in grouped_by_attributes:
    # Flatten the personality traits for each group into a single list
    traits_list = [trait for traits in group['personality traits'] for trait in traits.split(', ')]
    # Count occurrences of each trait
    trait_counts = pd.Series(traits_list).value_counts()
    # Get the most common trait
    most_common_trait = trait_counts.index[0]
    most_common_traits_by_attributes[attributes] = most_common_trait

# Print the results
for attributes, trait in most_common_traits_by_attributes.items():
    gender, zodiac_sign, age_range = attributes
    print(f"gender: {gender}, zodiac Sign: {zodiac_sign}, age range: {age_range}, most common personality trait: {trait}")

gender: Man, zodiac Sign: aries, age range: 20-29, most common personality trait: curious
gender: Man, zodiac Sign: cancer, age range: 20-29, most common personality trait: trustworthy
gender: Man, zodiac Sign: cancer, age range: 50-59, most common personality trait: ambitious
gender: Man, zodiac Sign: capricorn, age range: 50-59, most common personality trait: trustworthy
gender: Man, zodiac Sign: gemini, age range: 20-29, most common personality trait: optimist
gender: Man, zodiac Sign: gemini, age range: 40-49, most common personality trait: curious
gender: Man, zodiac Sign: gemini, age range: 60-69, most common personality trait: trustworthy
gender: Man, zodiac Sign: leo, age range: 20-29, most common personality trait: creative
gender: Man, zodiac Sign: leo, age range: 50-59, most common personality trait: loyal
gender: Man, zodiac Sign: leo, age range: 60-69, most common personality trait: curious
gender: Man, zodiac Sign: libra, age range: 17-19, most common personality trait: "

In [98]:
# Group the data by element
grouped_by_element = df.groupby('element')

# Dictionary to store the most common personality trait for each element
most_common_traits_by_element = {}

# Loop through each group and find the most common personality trait
for element, group in grouped_by_element:
    # Flatten the personality traits for each group into a single list
    traits_list = [trait for traits in group['personality traits'] for trait in traits.split(', ')]
    # Count occurrences of each trait
    trait_counts = pd.Series(traits_list).value_counts()
    # Get the most common trait
    most_common_trait = trait_counts.index[0]
    most_common_traits_by_element[element] = most_common_trait

# Print the results
for element, trait in most_common_traits_by_element.items():
    print(f"element: {element}, most common personality trait: {trait}")

element: air, most common personality trait: trustworthy
element: earth, most common personality trait: ambitious
element: fire, most common personality trait: loyal
element: water, most common personality trait: curious


In [101]:
# same birthday?
birthday_counts = df['birthday'].value_counts()

# Print the results
birthday_counts

birthday
September 1     3
June 13         2
October 5       2
November 13     2
July 29         2
               ..
February 17     1
March 31        1
January 17      1
June 19         1
September 10    1
Name: count, Length: 93, dtype: int64