In [164]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import pdist, squareform

## Data cleaning

In [20]:
# Read the restricted and the unrestricted subject tables from disk, in that order.
restricted, unrestricted = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../restricted_data/restricted_subject_data.csv",
        "../restricted_data/unrestricted_subject_data.csv",
    )
)

In [34]:
# Columns kept from each table; "Subject" appears in both and is the join key.
restricted_columns = [
    "Subject", "Age_in_Yrs", "HasGT", "ZygositySR", "ZygosityGT",
    "Family_ID", "Mother_ID", "Father_ID", "Race", "Ethnicity",
]
unrestricted_columns = ["Subject", "Gender"]

# Trim both tables to the columns of interest, then join them on the subject ID.
# No `how=` is given, so the merge uses pandas' default join type.
restricted = restricted[restricted_columns]
unrestricted = unrestricted[unrestricted_columns]
data = restricted.merge(unrestricted, on="Subject")

In [55]:
# Convert the self-reported zygosity labels: "NotMZ" becomes "DZ", while
# "NotTwin" and "MZ" map to themselves.
to_replace = dict(NotTwin="NotTwin", NotMZ="DZ", MZ="MZ")

# Assign the relabelled column back explicitly. Calling
# `data.ZygositySR.replace(..., inplace=True)` mutates an intermediate Series,
# which pandas warns about and which does not update `data` under Copy-on-Write.
data["ZygositySR"] = data["ZygositySR"].replace(to_replace)

In [94]:
# Reconcile self-reported (ZygositySR) and genotyped (ZygosityGT) zygosity into
# a single "Zygosity" column.
#
# Rule: when a subject is flagged as genotyped (HasGT) and the genotype call is
# "MZ" or "DZ", the genotype call is used; in every other case (not genotyped,
# or a blank / unrecognised genotype value) the self-report is used.
#
# This matches the previous row-by-row logic for all the cases it handled
# (MZ/MZ, DZ/MZ, DZ/DZ, blank genotype, not genotyped). It additionally covers
# the combinations that used to fall through every branch and were left as NaN,
# e.g. self-report "MZ" with genotype "DZ".
genotype_call_available = data["HasGT"].astype(bool) & data["ZygosityGT"].isin(
    ["MZ", "DZ"]
)
data["Zygosity"] = data["ZygosityGT"].where(
    genotype_call_available, data["ZygositySR"]
)

In [97]:
# Save the cleaned table, without the row index, for the later stages.
processed_path = "../restricted_data/processed.csv"
data.to_csv(processed_path, index=False)

## Removing subjects
1. without scans
2. without connectomes
3. singletons
4. subsample to two subjects per family

In [127]:
data = pd.read_csv("../restricted_data/processed.csv")

# 1. Drop subjects listed in the "without scans" file.
# ndmin=1 keeps the result one-dimensional even when the file holds a single
# ID; a 0-d array would be rejected by `isin`.
subjects_without_scans = np.loadtxt(
    "../data/subjects_without_dwi_t1.txt", dtype=int, ndmin=1
)
data = data[~data.Subject.isin(subjects_without_scans)]

# 2. Keep only subjects listed in the "with all connectomes" file.
subjects_with_connectomes = np.loadtxt(
    "../data/subjects_with_all_connectomes.txt", dtype=int, ndmin=1
)
data = data[data.Subject.isin(subjects_with_connectomes)]

# Sort by subject ID; reassigning avoids an in-place sort on a filtered frame.
data = data.sort_values("Subject")

# 3. Remove singletons: subjects who are the only remaining member of their
# family. Family sizes are computed once and broadcast back onto the rows.
family_sizes = data.Family_ID.map(data.Family_ID.value_counts())
singleton_idx = (family_sizes == 1).values
singletons = data.Subject[singleton_idx].tolist()

data = data[~singleton_idx]


male_idx = data.Gender == "M"
female_idx = data.Gender == "F"

# data.to_csv("../restricted_data/944subjects.csv", index=False)

In [257]:
# Subsample the data to have 2 subjects per family.
#
# Only families with more than two members are handled in this cell:
#   * a family with two "MZ" twins (checked first) or two "DZ" twins keeps the
#     twin pair;
#   * otherwise (no twins, or only one twin of a pair present) a sibling pair is
#     chosen from the whole family, based on the Family_ID format:
#       - three "_"-separated parts (families with half sibs): keep the two
#         members that share a parent ID;
#       - two parts: keep a same-sex pair, drawn at random when several exist.


def _drop_half_sibs(fam_df, fam):
    """Return the members of ``fam_df`` that share a parent ID.

    Mother IDs are examined first; father IDs are only examined when every
    member has the same ``Mother_ID``. Within the examined column, the single
    parent ID that occurs exactly twice identifies the pair that is kept.

    Raises
    ------
    ValueError
        If neither parent column varies within the family, if there is not
        exactly one parent ID occurring exactly twice, or if the selection
        yields more than two rows.
    """
    for parent_col in ("Mother_ID", "Father_ID"):
        parent_ids = fam_df[parent_col].values
        if len(set(parent_ids)) <= 1:
            continue  # all members share this parent; try the other column

        ids, n_children = np.unique(parent_ids, return_counts=True)
        shared = ids[n_children == 2]
        if len(shared) != 1:
            raise ValueError(
                f"Expected exactly one {parent_col} shared by two members "
                f"of fam: {fam}, found {len(shared)}"
            )

        full_sibs = fam_df[fam_df[parent_col] == shared[0]]
        if full_sibs.shape[0] > 2:
            raise ValueError(f"More than 2 subjects share a parent in fam: {fam}")
        return full_sibs

    raise ValueError(f"All members share both parent IDs in fam: {fam}")


def _same_sex_pair(fam_df, fam):
    """Return two same-sex members of ``fam_df``, or ``None`` if no pair exists.

    If only one gender has at least two members, that gender is used. If both
    do, the gender is drawn at random from "M"/"F". When the chosen gender has
    more than two members, two of them are drawn at random without replacement.
    The draws use the global NumPy random state (seeded before the loop).

    Note: the selection is random; age difference is not taken into account.

    Raises
    ------
    ValueError
        If the chosen gender unexpectedly has fewer than two members.
    """
    genders, n_per_gender = np.unique(fam_df.Gender, return_counts=True)
    pairable = n_per_gender > 1
    n_pairable = pairable.sum()

    if n_pairable == 1:  # only one gender pairing possible
        gender = genders[pairable][0]
    elif n_pairable == 2:
        gender = np.random.choice(["M", "F"], 1)[0]
    else:
        return None

    same_gender = fam_df[fam_df.Gender == gender]
    nrows = same_gender.shape[0]

    if nrows == 2:
        return same_gender
    if nrows > 2:
        random_subjects = np.random.choice(same_gender.Subject, 2, replace=False)
        return same_gender[same_gender.Subject.isin(random_subjects)]
    raise ValueError(f"Fewer than 2 subjects of gender {gender!r} in fam: {fam}")


def _pick_sibling_pair(fam_df, fam):
    """Choose a pair from a family based on its Family_ID format.

    Returns ``None`` when the ID has neither two nor three "_"-separated parts,
    or when no same-sex pair exists.
    """
    n_id_parts = len(fam.split("_"))
    if n_id_parts == 3:  # fams with half sibs
        return _drop_half_sibs(fam_df, fam)
    if n_id_parts == 2:
        return _same_sex_pair(fam_df, fam)
    return None


fams, counts = np.unique(data.Family_ID, return_counts=True)

subsampled_df = []

# Family IDs with more than two members for which no pair could be selected.
# These used to be skipped without any record.
others = []

np.random.seed(1)
for fam, count in zip(fams, counts):
    if count <= 2:
        continue

    fam_df = data.loc[data.Family_ID == fam]
    zygs, zyg_counts = np.unique(fam_df.Zygosity, return_counts=True)

    # Check if there are twins; "MZ" takes precedence over "DZ".
    if "MZ" in zygs:
        twin_label = "MZ"
    elif "DZ" in zygs:
        twin_label = "DZ"
    else:
        twin_label = None

    n_twins = zyg_counts[zygs == twin_label][0] if twin_label is not None else 0

    if n_twins > 2:
        raise ValueError(f"More than 2 {twin_label} twin for fam: {fam}")

    if n_twins == 2:
        pair = fam_df[fam_df.Zygosity == twin_label]
    else:
        # No complete twin pair: fall back to a sibling pair.
        if twin_label is None and len(fam.split("_")) not in (2, 3):
            raise ValueError(f"This family was not considered: {fam}")
        pair = _pick_sibling_pair(fam_df, fam)

    if pair is None:
        others.append(fam)
    else:
        subsampled_df.append(pair)

In [259]:
# Stack the collected per-family frames row-wise into a single DataFrame.
subsampled_df = pd.concat(objs=subsampled_df, axis="index")

In [269]:
# Families with exactly two members are kept whole.
two_member_fams = fams[counts == 2]
twos = data.loc[data.Family_ID.isin(two_member_fams)]

In [270]:
# Combine the subsampled families with the two-member families, row-wise.
final = pd.concat((subsampled_df, twos), axis="index")

In [272]:
# Write the combined table. `index=False` is not passed, so the row index is
# written as the first column.
# NOTE(review): the processed.csv export passes index=False; confirm whether
# readers of this file rely on the index column before changing it.
final.to_csv(path_or_buf="../restricted_data/700subjects.csv")

In [273]:
# Display the (rows, columns) shape of the combined table as a sanity check.
final.shape

(700, 12)

In [288]:
# Subject IDs that are filtered out of the subsampled table.
# NOTE(review): the reason for excluding these ten subjects is not recorded in
# this notebook; TODO document it.
to_remove = np.asarray(
    [
        256540,
        549757,
        179245,
        849264,
        108525,
        146331,
        204521,
        214221,
        284646,
        979984,
    ]
)

In [292]:
# Keep only the rows whose subject is not on the exclusion list.
is_excluded = final.Subject.isin(to_remove)
final = final.loc[~is_excluded]

In [294]:
# Write the table after exclusions. `index=False` is not passed, so the row
# index is written as the first column.
# NOTE(review): the processed.csv export passes index=False; confirm whether
# readers of this file rely on the index column before changing it.
final.to_csv(path_or_buf="../restricted_data/690subjects.csv")

## Compute dataframe of same-sex MZ and DZ twin pairs

In [274]:
# Restrict the table to subjects labelled as twins ("MZ" or "DZ").
is_twin = final.Zygosity.isin(["MZ", "DZ"])
twins_df = final.loc[is_twin]

In [277]:
# Distinct family IDs among the twins and how many twins each one contributes.
fams, counts = np.unique(twins_df["Family_ID"], return_counts=True)

In [281]:
# Collect the twin pairs to keep: every complete MZ pair, and every complete DZ
# pair whose two members have the same gender. DZ pairs with two different
# genders are skipped.
out = []

for fam, count in zip(fams, counts):
    if count == 1:
        # Only one twin from this family is in the table: no pair to keep.
        continue
    if count != 2:
        raise ValueError(f"Expected at most 2 twins in family {fam}, found {count}")

    pair = twins_df[twins_df.Family_ID == fam]
    zyg = np.unique(pair.Zygosity)
    if len(zyg) > 1:
        raise ValueError(
            f"Twins in family {fam} have mixed zygosity labels: {list(zyg)}"
        )

    if zyg[0] == "MZ":
        out.append(pair)
    elif zyg[0] == "DZ":
        # A two-row frame holds one or two distinct genders; keep the pair
        # only when both members share one.
        if len(np.unique(pair.Gender)) == 1:
            out.append(pair)
    else:
        raise ValueError(f"Unexpected zygosity label {zyg[0]!r} in family {fam}")

In [283]:
# Stack the retained twin pairs row-wise into a single DataFrame.
same_sex_twins = pd.concat(objs=out, axis="index")

In [299]:
# Write the same-sex twin table once, with the excluded subjects filtered out
# and without the row index.
#
# A second, later cell used to write the unfiltered frame (with its index) to
# the same path. Run top to bottom, it overwrote the filtered file. Its
# execution count (287) is lower than this cell's (299), so the filtered write
# was the last one actually run; that is the version kept here.
same_sex_twins[~same_sex_twins.Subject.isin(to_remove)].to_csv(
    "../restricted_data/same_sex_twins.csv", index=False
)