### Setup and Feature Selection
Import data, convert text data to numerical data, and select features (can change chosen features in this block).

In [113]:
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.cluster import KMeans

# Load the survey export and keep everything from the fourth column onward;
# the first three columns are not used in this analysis.
df = pd.read_csv('masculinity.csv').iloc[:, 3:]

# Encode answers on a separate copy so `df` keeps the original text responses.
features = df.copy()

#TODO
# what to do with "no answer"
# convert floats to ints?
# nothing obvious to do with q0009
# q0010 only applies to employed
# q0025 is about children, probably can do something with that
# q0026 is sexual orientation

# In general, how masculine or “manly” do you feel?
# Answers are ranked 0-3 by their position in the list below; any response that
# is not listed becomes NaN after .map().
features['q0001'] = features['q0001'].map({
    answer: rank
    for rank, answer in enumerate([
        'Not at all masculine',
        'Not very masculine',
        'Somewhat masculine',
        'Very masculine',
    ])
})

# How important is it to you that others see you as masculine?
# Same 0-3 ranking scheme as above.
features['q0002'] = features['q0002'].map({
    answer: rank
    for rank, answer in enumerate([
        'Not at all important',
        'Not too important',
        'Somewhat important',
        'Very important',
    ])
})

# Where have you gotten your ideas about what it means to be a good man?
# One column per option: 0 when the cell reads 'Not selected', 1 for anything else.
q4_cols = [name for name in features.columns if name.startswith('q0004')]
features[q4_cols] = (features[q4_cols] != 'Not selected').astype('int64')

# Do you think that society puts pressure on men in a way that is unhealthy or bad for them?
# 1 only for an exact 'Yes'; every other value (including missing ones) is coded 0.
features['q0005'] = (features['q0005'] == 'Yes').astype('int64')

# How often would you do a variety of things? (masculine and not)
# Frequency answers are ranked 0-4 in the order listed; unlisted responses become NaN.
frequency_rank = {
    answer: rank
    for rank, answer in enumerate([
        'Never, and not open to it',
        'Never, but open to it',
        'Rarely',
        'Sometimes',
        'Often',
    ])
}
q7_cols = [name for name in features.columns if name.startswith('q0007')]
features = features.assign(
    **{name: features[name].map(frequency_rank) for name in q7_cols}
)

# Which of the following do you worry about on a daily or near daily basis? (masculine and not)
# One column per option: 0 when the cell reads 'Not selected', 1 for anything else.
q8_cols = [name for name in features.columns if name.startswith('q0008')]
for col in q8_cols:
    features[col] = features[col].ne('Not selected').astype('int64')

# Drop the questions that pertain to work or were only shown to subset groups.
# The list covers q0009 through q0015; multi-part questions contribute one column
# per part (q0010 has 8 parts, q0011 has 5, q0012 has 7).
cols_to_drop = (
    ['q0009']
    + ['q0010_{:04d}'.format(part) for part in range(1, 9)]
    + ['q0011_{:04d}'.format(part) for part in range(1, 6)]
    + ['q0012_{:04d}'.format(part) for part in range(1, 8)]
    + ['q0013', 'q0014', 'q0015']
)

features = features.drop(cols_to_drop, axis='columns')

# Do you typically feel as though you’re expected to make the first move in romantic relationships?
# 1 only for an exact 'Yes'; every other value (including missing ones) is coded 0.
features['q0017'] = features['q0017'].eq('Yes').astype('int64')

# How often do you try to be the one who pays when on a date?
# Ranked 0-4 in the order listed; unlisted responses become NaN.
features['q0018'] = features['q0018'].map(
    dict(zip(['Never', 'Rarely', 'Sometimes', 'Often', 'Always'], range(5)))
)

# Drop every part of q19 (7 parts), q20 (6 parts) and q21 (4 parts).
cols_to_drop2 = (
    ['q0019_{:04d}'.format(part) for part in range(1, 8)]
    + ['q0020_{:04d}'.format(part) for part in range(1, 7)]
    + ['q0021_{:04d}'.format(part) for part in range(1, 5)]
)

features = features.drop(cols_to_drop2, axis='columns')

# Have you changed your behavior in romantic relationships in the wake of #MeToo movement?
# 1 only for an exact 'Yes'; every other value (including missing ones) is coded 0.
features['q0022'] = features['q0022'].eq('Yes').astype('int64')

# demographics

# Drop q24, all three parts of q25, q26 and q28.
cols_to_drop3 = (
    ['q0024']
    + ['q0025_{:04d}'.format(part) for part in range(1, 4)]
    + ['q0026', 'q0028']
)
features = features.drop(cols_to_drop3, axis='columns')

# What is the last grade of school you completed?
# Ordinal encoding from least to most schooling. "Some college" (no degree) is
# ranked below "Associate's degree" so the codes increase with attainment, which
# matters for distance-based models. Any response not listed becomes NaN after .map().
features['q0029'] = features['q0029'].map({
    "Did not complete high school": 0,
    "High school or G.E.D.": 1,
    "Some college": 2,
    "Associate's degree": 3,
    "College graduate": 4,
    "Post graduate degree": 5,
})

# q0034: dollar-range brackets ranked 0-9 from lowest to highest;
# unlisted responses become NaN.
features['q0034'] = features['q0034'].map({
    bracket: rank
    for rank, bracket in enumerate([
        "$0-$9,999",
        "$10,000-$24,999",
        "$25,000-$49,999",
        "$50,000-$74,999",
        "$75,000-$99,999",
        "$100,000-$124,999",
        "$125,000-$149,999",
        "$150,000-$174,999",
        "$175,000-$199,999",
        "$200,000+",
    ])
})

# drop q30 (what state do you live in?) together with q35 and q36
features = features.drop(['q0030', 'q0035', 'q0036'], axis='columns')

# race2: 1 for an exact 'White', 0 for every other value.
features['race2'] = features['race2'].eq('White').astype('int64')

# educ3 / educ4: education buckets ranked from lowest to highest;
# unlisted responses become NaN.
features['educ3'] = features['educ3'].map(
    dict(zip(["High school or less", "Some college", "College or more"], range(3)))
)

features['educ4'] = features['educ4'].map(
    dict(zip(
        ["High school or less", "Some college", "College or more", "Post graduate degree"],
        range(4),
    ))
)

# kids: 1 for an exact 'Has children', 0 for every other value.
features['kids'] = features['kids'].eq('Has children').astype('int64')

# Show which columns remain after encoding and dropping.
print(features.columns)

Index(['q0001', 'q0002', 'q0004_0001', 'q0004_0002', 'q0004_0003',
       'q0004_0004', 'q0004_0005', 'q0004_0006', 'q0005', 'q0007_0001',
       'q0007_0002', 'q0007_0003', 'q0007_0004', 'q0007_0005', 'q0007_0006',
       'q0007_0007', 'q0007_0008', 'q0007_0009', 'q0007_0010', 'q0007_0011',
       'q0008_0001', 'q0008_0002', 'q0008_0003', 'q0008_0004', 'q0008_0005',
       'q0008_0006', 'q0008_0007', 'q0008_0008', 'q0008_0009', 'q0008_0010',
       'q0008_0011', 'q0008_0012', 'q0017', 'q0018', 'q0022', 'q0029', 'q0034',
       'race2', 'racethn4', 'educ3', 'educ4', 'age3', 'kids', 'orientation',
       'weight'],
      dtype='object')


### Data Visualizations

In [None]:
# Compare the two self-perception questions against each other.
cols = ['q0001', 'q0002']

# Keep only respondents with a usable (non-NaN) code for both questions.
subset = features[cols].dropna()

# Counts for every combination of the two encoded answers.
print(pd.crosstab(subset[cols[0]], subset[cols[1]]))

# Both variables take only a few integer values, so points overlap heavily;
# a low alpha makes denser combinations appear darker.
fig, ax = plt.subplots()
ax.scatter(subset[cols[0]], subset[cols[1]], alpha=0.1)
ax.set_xlabel('q0001: how masculine do you feel? (0 = not at all, 3 = very)')
ax.set_ylabel('q0002: importance of being seen as masculine (0 = not at all, 3 = very)')
ax.set_title('Self-rated masculinity vs. importance of being seen as masculine')
plt.show()

### Model #1 

Each model uses 2 clusters, which in theory correspond to a "traditionally masculine" group and a "not traditionally masculine" group.

Goal: Include as many features as possible that apply to everyone, create a model, see what groupings it comes up with, and analyze the groups.

Shortcomings: probably too many features, some of which may not even be relevant; many rows are removed because of NaN values.

In [100]:
# Display the encoded feature table.
# NOTE(review): the saved output of this cell still lists q0035 and q0036, which the
# setup cell drops, and this cell's execution count (100) precedes the setup cell's
# (113) - the output looks stale; re-run after the setup cell to refresh it.
features

Unnamed: 0,q0001,q0002,q0004_0001,q0004_0002,q0004_0003,q0004_0004,q0004_0005,q0004_0006,q0005,q0007_0001,...,q0035,q0036,race2,racethn4,educ3,educ4,age3,kids,orientation,weight
0,2.0,2.0,0,0,0,1,0,0,1,4.0,...,Middle Atlantic,Windows Desktop / Laptop,0,Hispanic,2.0,2.0,35 - 64,0,Gay/Bisexual,1.714026
1,2.0,2.0,1,0,0,0,0,0,1,2.0,...,East North Central,iOS Phone / Tablet,1,White,1.0,1.0,65 and up,1,Straight,1.247120
2,3.0,1.0,1,0,0,0,0,1,0,3.0,...,East North Central,Windows Desktop / Laptop,1,White,2.0,2.0,35 - 64,1,Straight,0.515746
3,3.0,1.0,1,1,1,0,0,0,0,2.0,...,East North Central,Windows Desktop / Laptop,1,White,1.0,1.0,65 and up,1,No answer,0.600640
4,3.0,3.0,0,0,1,0,0,0,1,3.0,...,East North Central,Windows Desktop / Laptop,1,White,2.0,2.0,35 - 64,0,Straight,1.033400
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1184,2.0,0.0,0,1,0,0,0,0,1,3.0,...,East North Central,Windows Desktop / Laptop,0,Other,1.0,1.0,35 - 64,0,Gay/Bisexual,1.571755
1185,2.0,2.0,0,0,0,0,0,1,1,3.0,...,New England,Windows Desktop / Laptop,1,White,0.0,0.0,35 - 64,1,Straight,2.446719
1186,3.0,3.0,1,1,0,0,0,0,0,4.0,...,Pacific,iOS Phone / Tablet,1,White,1.0,1.0,65 and up,1,Straight,0.615180
1187,2.0,3.0,1,1,0,1,0,0,1,1.0,...,Middle Atlantic,iOS Phone / Tablet,0,Other,2.0,2.0,18 - 34,0,Straight,2.965508
