## XP-EHH analysis

In [None]:
!pip install -qq malariagen_data
import malariagen_data
import numpy as np
import pandas as pd
import allel
import zarr
import matplotlib.pyplot as plt
import seaborn as sns

# Instantiate the malariagen_data Ag3 API object; the cells below call
# `ag3.xpehh_gwss` and `ag3.plot_xpehh_gwss` on it.
ag3 = malariagen_data.Ag3()
# Bare last expression so the notebook displays the object's repr.
ag3

In [None]:
# Helper functions for the selection scans
# Genome-wide selection scans (GWSS) with XP-EHH
def run_XP_EHH(taxon1, taxon2, contig, country, window_size=200):
  """
  Run an XP-EHH genome-wide selection scan between two taxa on one contig.

  Both cohorts are restricted to the same `country`; they differ only in
  their `taxon` value. The work is delegated to `ag3.xpehh_gwss` with
  `analysis="gamb_colu"`.

  Parameters
  ----------
  taxon1, taxon2 : str
      Taxon of the first and second cohort.
  contig : str
      Contig to scan (e.g. "3R").
  country : str
      Country both cohorts are drawn from.
  window_size : int, default 200
      Forwarded unchanged to `ag3.xpehh_gwss`.

  Returns
  -------
  tuple
      `(pos, xp_ehh)` exactly as returned by `ag3.xpehh_gwss`, or
      `(None, None)` if the call raises; the error is printed, not re-raised.
  """
  try:
    # repr() picks the quote style for us, so values containing an apostrophe
    # (e.g. "Cote d'Ivoire") still give a valid query. For apostrophe-free
    # values the text is identical to hand-written single quotes.
    cohort1_query = f"country=={str(country)!r} and taxon=={str(taxon1)!r}"
    cohort2_query = f"country=={str(country)!r} and taxon=={str(taxon2)!r}"
    pos, xp_ehh = ag3.xpehh_gwss(
        contig=contig,
        cohort1_query=cohort1_query,
        cohort2_query=cohort2_query,
        window_size=window_size,
        analysis="gamb_colu",
    )
    return pos, xp_ehh
  except Exception as e:
    # Deliberate best-effort: report the failure and let the caller decide
    # whether to skip this contig.
    print(f"Error running XP-EHH: {e}")
    return None, None

def run_XP_EHH_loc(taxon, cohort1, cohort2, country, contig, window_size=200, cohort_group=None):
  """
  Run an XP-EHH scan between two sub-populations of one taxon on one contig.

  Both cohorts share `country` and `taxon`; they differ in the value of one
  sample-metadata grouping column (`cohort_group`). The work is delegated to
  `ag3.xpehh_gwss` with `analysis="gamb_colu"`, `min_cohort_size=10` and
  `max_cohort_size=100`.

  Parameters
  ----------
  taxon : str
      Taxon shared by both cohorts.
  cohort1, cohort2 : str
      Values of `cohort_group` selecting the first and second cohort.
  country : str
      Country both cohorts are drawn from.
  contig : str
      Contig to scan.
  window_size : int, default 200
      Forwarded unchanged to `ag3.xpehh_gwss`.
  cohort_group : str, optional
      Grouping column to compare on. Must be one of 'country_iso',
      'admin1_name', 'admin1_iso', 'admin2_name', 'cohort_admin1_year'.
      When omitted, the choices are printed and the value is read
      interactively with `input()`; pass it explicitly for reproducible,
      non-interactive runs.

  Returns
  -------
  tuple
      `(pos, xp_ehh)` as returned by `ag3.xpehh_gwss`, or `(None, None)`
      when the cohort group is invalid or the scan raises.
  """
  list_cohort_group = ['country_iso', 'admin1_name', 'admin1_iso', 'admin2_name', 'cohort_admin1_year']
  if cohort_group is None:
    # Interactive fallback, kept for callers that rely on the prompt.
    print("cohort group list:")
    for i in list_cohort_group:
      print(i)
    cohort_group = input("Enter the cohort group to use: ")
  if cohort_group not in list_cohort_group:
    print("Invalid cohort group. Please choose from the list.")
    # Same shape as every other return path, so `pos, xp = ...` unpacking
    # in the caller does not blow up on a bad group.
    return None, None
  try:
    # `cohort_group` is a validated column name, so it is inserted verbatim;
    # the values are repr-quoted so apostrophes cannot break the query.
    shared = f"country=={str(country)!r} and taxon=={str(taxon)!r}"
    pos, xp_ehh = ag3.xpehh_gwss(
        contig=contig,
        cohort1_query=f"{shared} and {cohort_group} == {str(cohort1)!r}",
        cohort2_query=f"{shared} and {cohort_group} == {str(cohort2)!r}",
        window_size=window_size,
        analysis="gamb_colu",
        min_cohort_size=10,
        max_cohort_size=100,
    )
    return pos, xp_ehh
  except Exception as e:
    # Deliberate best-effort: report and signal failure with (None, None).
    print(f"Error running XP-EHH: {e}")
    return None, None

In [None]:
# Scan configuration. These stay module-level because the save step at the
# bottom of this cell builds the output file name from them.
taxon1 = 'bissau'
taxon2 = 'gambiae'
#contigs = ['2L', '2R', '3L', '3R', 'X']
contigs = ["3R"]
country = 'Gambia, The'
window_size = 200
cohort_results = []  # one dict per contig whose scan succeeded

for contig in contigs:
    print(f"Processing contig: {contig}")

    try:
        pos, xp_ehh = run_XP_EHH(taxon1, taxon2, contig, country, window_size=window_size)

        # run_XP_EHH signals failure with (None, None) after printing the
        # underlying error; skip the contig instead of failing on `None.shape`.
        if pos is None or xp_ehh is None:
            print(f"Error running XP-EHH for {contig}: scan returned no data")
            continue

        # Normalise to 2-D (rows x columns) so 1-D and 2-D results share one
        # code path, then keep the last column. Indexing it directly raises
        # for a zero-column result instead of silently reusing a value left
        # over from a previous contig.
        xp_ehh = np.asarray(xp_ehh)
        xp_ehh = xp_ehh.reshape(xp_ehh.shape[0], -1)
        xp_ehh_perc = xp_ehh[:, -1]

        cohort_results.append({
            "chrom": contig,
            "Chr_pos": pos,           # Chromosomal positions
            "xp_ehh": xp_ehh_perc,    # XP-EHH values (last column)
            "window_size": window_size,
            "taxon_1": taxon1,
            "taxon_2": taxon2,
            "country": country
        })
        print(f"XP-EHH scan completed for {contig}.")
    except Exception as e:
        # Best-effort: one bad contig must not abort the remaining ones.
        print(f"Error running XP-EHH for {contig}: {e}")
        continue

# Build the tidy result table with a single concat; scalar fields are
# broadcast along each contig's position array by the DataFrame constructor.
result_columns = ["Chr_pos", "xp_ehh", "chrom", "window_size", "taxon_1", "taxon_2", "country"]
if cohort_results:
    final_xp_ehh_df = pd.concat(
        [pd.DataFrame(result)[result_columns] for result in cohort_results],
        ignore_index=True,
    )
else:
    # No contig succeeded: keep the expected columns so the CSV has a header.
    final_xp_ehh_df = pd.DataFrame(columns=result_columns)

print("XP-EHH selection scan complete.")

# Save the final results to a CSV file
final_xp_ehh_df.to_csv(f'XP_EHH_{taxon1}_vs_{taxon2}_results.csv', index=False)
final_xp_ehh_df

## XP-EHH plot

In [None]:
# Reload the saved results so the figure can be rebuilt without re-running the scan.
# NOTE(review): this is an absolute Colab path, while the scan cell writes the CSV
# relative to the working directory — confirm both point at the same file.
final_xp_ehh_df = pd.read_csv('/content/XP_EHH_bissau_vs_gambiae_results.csv')
chromosomes = final_xp_ehh_df['chrom'].unique()

# Cohorts are identified by `taxon_1`. Each cohort gets one fixed colour for
# the whole figure, so the same cohort looks the same in every panel even
# when a chromosome is missing some cohorts.
cohorts = final_xp_ehh_df['taxon_1'].unique()
colors = plt.cm.tab20.colors  # Color palette
color_map = [colors[i % len(colors)] for i in range(len(cohorts))]
cohort_colors = dict(zip(cohorts, color_map))

# Subplot grid: ceiling division keeps the row count right for any `ncols`.
ncols = 2
nrows = max(1, -(-len(chromosomes) // ncols))

# squeeze=False always yields a 2-D axes array, so flatten() is safe for
# every grid shape.
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(16, 7 * nrows),
                         sharex=False, sharey=False, squeeze=False)
axes = axes.flatten()

# One panel per chromosome
for i, chrom in enumerate(chromosomes):
    ax = axes[i]
    # Boolean mask rather than a string query: works for string and numeric
    # chromosome labels alike.
    chr_data = final_xp_ehh_df[final_xp_ehh_df['chrom'] == chrom]

    chrom_cohorts = chr_data['taxon_1'].unique()
    x_ticks = []
    x_labels = []
    current_x = 0  # running x offset so cohorts are laid out side by side

    for idx, cohort in enumerate(chrom_cohorts):
        cohort_data = chr_data[chr_data['taxon_1'] == cohort]

        # X values: positions shifted by the current cohort offset
        x = cohort_data['Chr_pos'] + current_x
        y = cohort_data['xp_ehh']

        ax.scatter(x, y, color=cohort_colors.get(cohort, colors[idx % len(colors)]),
                   s=10, label=f'{cohort}')

        # One tick per cohort, placed at the median plotted position
        x_ticks.append(x.median())
        x_labels.append(f'{cohort}')

        # Start the next cohort after this one, with a small gap
        current_x += cohort_data['Chr_pos'].max() + 50

    # Optional threshold line (disabled):
    # threshold = chr_data['xp_ehh'].quantile(0.99)  # Top 1% as threshold
    # ax.axhline(y=threshold, color='black', linestyle='--', linewidth=1)

    ax.set_title(f'XP-EHH Plot for Chromosome {chrom}', fontsize=12)
    ax.set_ylabel('XP-EHH')
    ax.legend(loc='upper right', fontsize=8)

    ax.set_xticks(ticks=x_ticks)
    ax.set_xticklabels(labels=x_labels, rotation=45, ha='right')

# Remove the unused axes left over when the grid has more cells than chromosomes
for j in range(len(chromosomes), len(axes)):
    fig.delaxes(axes[j])

# Set global x-axis label
fig.supxlabel('Cohorts', fontsize=14)

# Adjust layout
plt.tight_layout()

# Save the plot
plt.savefig('xp_ehh_per_cohort.png', dpi=300)

# Display the plot
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load XP-EHH result
df = pd.read_csv("/content/XP_EHH_bissau_vs_gambiae_results.csv", sep=",")  # Adjust separator

# Drop rows missing any plotted field and force chromosome labels to strings.
# `.assign` builds a new frame, so no column is written into a dropna result.
df = (
    df.dropna(subset=['chrom', 'Chr_pos', 'xp_ehh'])
      .assign(chrom=lambda frame: frame['chrom'].astype(str))
)

# If every chromosome label is an integer, order them numerically.
# Labels such as '3R' or 'X' make int() raise ValueError; in that case the
# plain string labels are kept as they are.
try:
    chrom_order = sorted(df['chrom'].unique(), key=int)
except ValueError:
    chrom_order = None
if chrom_order is not None:
    df['chrom'] = pd.Categorical(df['chrom'], ordered=True, categories=chrom_order)

# Plot: one scatter series per chromosome, alternating two colours
plt.figure(figsize=(14, 6))
colors = ['skyblue', 'steelblue']
# observed=True only matters for a categorical key: it skips empty categories.
for i, (chrom, group) in enumerate(df.groupby('chrom', observed=True)):
    plt.scatter(group['Chr_pos'], group['xp_ehh'],
                s=8, label=f'Chr {chrom}', alpha=0.6, color=colors[i % 2])

# Optional threshold lines (disabled):
# plt.axhline(2, color='red', linestyle='--', lw=1)
# plt.axhline(-2, color='red', linestyle='--', lw=1)

# Axis and labels
plt.xlabel("Position (bp)", fontsize=12)
plt.ylabel("XP-EHH Score", fontsize=12)
plt.title("XP-EHH per Chromosome", fontsize=14)
plt.legend(title='Chromosome', bbox_to_anchor=(1.01, 1), loc='upper left', fontsize=8)
plt.tight_layout()
plt.grid(alpha=0.3)

plt.show()


In [None]:
# Comparison to plot: two taxa from the same country, on a single contig.
taxon1, taxon2 = 'bissau', 'gambiae'
contig = '3R'
country = 'Gambia, The'
window_size = 200
# Gene names of interest; defined here but not passed to the call below.
gene_labels = ['Or13', 'Or15', 'Or16', 'Or17', 'Or30', 'Or46', 'Or47', 'Or53', 'Or55']

# Collect the keyword arguments first, then make a single plotting call.
xpehh_plot_kwargs = dict(
    contig=contig,
    cohort1_query=f"country=='{country}' and taxon=='{taxon1}'",
    cohort2_query=f"country=='{country}' and taxon=='{taxon2}'",
    window_size=window_size,
    analysis="gamb_colu",
    max_cohort_size=100,
)
ag3.plot_xpehh_gwss(**xpehh_plot_kwargs)

The XP-EHH scan on chromosome 3R shows two to three peaks, corresponding to differences in selection between the Bissau molecular form and *An. gambiae*. Among them, the region around 28.5 Mb containing the Gste genes appears to be under selection specifically in the *An. gambiae* population, while the **region around 32 Mb where the Or genes** are located appears to be under selection in the Bissau molecular form. Another peak, around 25.74 Mb where Or29 is located, is under selection only in the Bissau molecular form.
 This result confirms our assumption of independent evolution of the Or gene regions in the Bissau molecular form and is concordant with the IBD result, in which no shared IBD segments were found around the Or gene regions among members of the *An. gambiae* complex.