# Annoy Analysis:
Annoy (Approximate Nearest Neighbors Oh Yeah) is an efficient technique for finding nearest neighbors in large datasets, particularly in applications involving vectors, such as machine learning and information retrieval.

## How Annoy Works:

Data (typically vectors) are added to an index. Each vector represents a numerical representation of an item.

Tree Division: Annoy builds multiple search trees, where each tree is constructed based on a random sample of the data. The idea is to partition the space into regions, allowing for faster neighbor searches.

Search Algorithm:
When querying a vector, Annoy searches through the constructed trees. It traverses the trees to find the nearest neighbors.
The algorithm is designed to be fast, using approximation, which means it might not always find the exact nearest neighbors but frequently finds very close neighbors efficiently.

In [1]:
!pip install annoy

Collecting annoy
  Downloading annoy-1.17.3.tar.gz (647 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/647.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m645.1/647.5 kB[0m [31m25.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m647.5/647.5 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: annoy
  Building wheel for annoy (setup.py) ... [?25l[?25hdone
  Created wheel for annoy: filename=annoy-1.17.3-cp311-cp311-linux_x86_64.whl size=553319 sha256=728aea7959a44b96b0ce334a4e91dddd7be7213506c9cf4fb47b0eccd63a90db
  Stored in directory: /root/.cache/pip/wheels/33/e5/58/0a3e34b92bedf09b4c57e37a63ff395ade6f6c1099ba59877c
Successfully built annoy
Installing collected packages: annoy
Successfully installed annoy-1.17.3


In [1]:
# Import Libs
import pandas as pd
from annoy import AnnoyIndex

In [2]:
df = pd.read_excel('maize_w2vec_3mer_dataset.xlsx')

In [3]:
df.head(2)

Unnamed: 0,circName,circID,gene,isoform,stress,tissue,chr,start,end,strand,...,wc_3mer_55,wc_3mer_56,wc_3mer_57,wc_3mer_58,wc_3mer_59,wc_3mer_60,wc_3mer_61,wc_3mer_62,wc_3mer_63,wc_3mer_64
0,zma-circ1-Zm00001d002325,2:10317309-10317467_-,Zm00001d002325,Zm00001d002325_T001,-,multipleTissue,2,10317309,10317467,-,...,-22.70613,-10.906356,27.552442,0.419338,-7.335388,-4.650829,13.64414,-0.766124,-20.574353,-42.013476
1,zma-circ2-Zm00001d038675,6:162376852-162378246_+,Zm00001d038675,Zm00001d038675_T004,-,multipleTissue,6,162376852,162378246,+,...,-83.926546,86.007756,13.933029,-58.683456,-49.832305,46.633372,-282.08306,-115.708011,99.670065,-119.454137


In [4]:
dimension = 64

# Drought stress dataset

In [5]:
df_drought = df.query('stress == "-" or stress == "drought"').replace('-', 'control')

In [6]:
df_vec = df_drought.drop(['seq','tissue','chr','start','end','strand','start_anno','circID', 'gene', 'isoform', 'width', 'detection_score', 'stress_detection_score','end_anno', 'antisense', 'algorithm', 'exonSeq',	'predAA',	'miRNA',	'superCircRNARegion'], axis=1)
df_vec

Unnamed: 0,circName,stress,wc_3mer_1,wc_3mer_2,wc_3mer_3,wc_3mer_4,wc_3mer_5,wc_3mer_6,wc_3mer_7,wc_3mer_8,...,wc_3mer_55,wc_3mer_56,wc_3mer_57,wc_3mer_58,wc_3mer_59,wc_3mer_60,wc_3mer_61,wc_3mer_62,wc_3mer_63,wc_3mer_64
0,zma-circ1-Zm00001d002325,control,-18.428862,10.705422,31.344736,5.652841,-0.779334,3.498795,-19.853127,15.140625,...,-22.706130,-10.906356,27.552442,0.419338,-7.335388,-4.650829,13.644140,-0.766124,-20.574353,-42.013476
1,zma-circ2-Zm00001d038675,control,188.770603,-192.573027,-201.852659,-271.642781,119.762526,66.806197,-213.114692,174.141792,...,-83.926546,86.007756,13.933029,-58.683456,-49.832305,46.633372,-282.083060,-115.708011,99.670065,-119.454137
2,zma-circ3-Zm00001d038163,control,-12.990614,25.964586,34.343807,11.887014,-1.077510,-21.205808,20.403219,8.169983,...,19.000492,-27.264513,-23.633633,33.123389,36.564765,8.799955,24.341052,4.967472,-12.689407,-0.401715
3,zma-circ4-Zm00001d049552,control,-12.245401,-0.431559,29.199812,-2.821389,3.386556,10.741132,-10.584288,6.070870,...,-22.908495,7.463494,10.920428,4.524147,-15.538093,-13.341156,-0.296664,-3.016197,-0.477901,-29.496298
4,zma-circ5-Zm00001d032567,control,13.412131,-9.102908,-52.040971,-77.734476,-17.086470,15.942249,-55.777546,34.609959,...,-2.619020,-15.054933,75.592412,-3.791055,26.519955,0.549389,-57.986867,-63.266628,5.492640,-47.422619
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38780,zma-circ38781--,control,-31.971423,46.936561,37.374460,24.961271,-20.605168,-39.592321,42.670800,-18.597109,...,55.547672,-34.047023,-53.817768,24.989076,78.595877,12.960359,41.452756,24.087117,-24.891010,24.140963
38781,zma-circ38782--,control,-7.262153,24.640844,-6.384969,-38.287346,7.022847,-3.239892,-1.993565,-7.791742,...,16.006142,-7.553426,9.093510,10.978614,37.843186,18.243088,-5.548205,-11.582676,-3.474853,13.129991
38782,zma-circ38783--,control,-75.265843,89.033060,39.742789,1.053011,-37.685334,-40.503932,34.544268,-78.418910,...,111.958343,-4.080127,-47.411417,19.781623,169.187258,13.342812,63.040171,58.404351,-10.334180,49.309830
38783,zma-circ38784--,control,82.905541,27.218369,-27.637714,-128.632740,143.095296,-70.905655,106.884221,108.640803,...,67.985388,16.778138,-142.177544,110.127476,96.632890,181.435268,-39.288972,4.967059,35.011970,79.163880


In [7]:
df_vec['stress'].value_counts()

Unnamed: 0_level_0,count
stress,Unnamed: 1_level_1
control,20809
drought,11206


In [11]:
drought_sequences = df_vec[df_vec['stress'] == 'drought']
control_sequences = df_vec[df_vec['stress'] == 'control']

# Controlled sample of "control" sequences
control_sample = control_sequences.sample(n=11206, random_state=40)

# Combine "drought" and "control" sequences
combined_sequences = pd.concat([drought_sequences, control_sample])

# Initialize the Annoy index
t = AnnoyIndex(dimension, 'angular')
for i in range(len(combined_sequences)):
    t.add_item(i, combined_sequences.iloc[i, 2:2 + dimension].values)
t.build(10)

# Initialize data structures
similarities = {}
n_neighbors = 5  # Desired number of neighbors

valid_neighbors_count = []  # Count of valid neighbors
drought_without_neighbors = 0  # Counter for sequences with no valid neighbors
total_neighbors_found = 0  # Counter for found neighbors
sequences_with_less_than_5_neighbors = []  # For sequences with less than 5 valid neighbors

drought_count = 0
control_count = 0  # Count of neighbors in the "control" group

# Process the "drought" sequences
for stress_index in drought_sequences.index:
    adjusted_index = combined_sequences.index.get_loc(stress_index)

    # Search for neighbors
    neighbors = t.get_nns_by_item(adjusted_index, n_neighbors + 1)
    valid_neighbors = []

    for i in neighbors:
        if combined_sequences.iloc[i]['circName'] != combined_sequences.iloc[adjusted_index]['circName']:
            valid_neighbors.append(i)
        if len(valid_neighbors) == n_neighbors:
            break

    # Check if 5 valid neighbors were found
    if len(valid_neighbors) < n_neighbors:
        valid_neighbors_count.append(0)
        drought_without_neighbors += 1
        sequences_with_less_than_5_neighbors.append(combined_sequences.iloc[adjusted_index]['circName'])
    else:
        valid_neighbors_count.append(len(valid_neighbors))

    total_neighbors_found += len(valid_neighbors)  # Total neighbors found
    similarities[stress_index] = valid_neighbors  # Add the found neighbors for the sequence

    # Count neighbors for "drought" and "control"
    for i in valid_neighbors:
        neighbor_condition = combined_sequences.iloc[i]['stress']
        if neighbor_condition == 'drought':
            drought_count += 1
        elif neighbor_condition == 'control':
            control_count += 1

# Display the count of valid neighbors per "drought" sequence
print("Valid neighbors count per 'drought' sequence:")
print(valid_neighbors_count)

# Display the number of sequences with no neighbors
print(f'Number of iterated drought stress sequences without neighbors: {drought_without_neighbors}')
print(f'Total number of neighbors found: {total_neighbors_found}')  # Verifying the total number of neighbors found

# Print the sequences with less than 5 valid neighbors
print("Sequences with less than 5 valid neighbors:")
print(sequences_with_less_than_5_neighbors)

# Creating the final DataFrame
data = []

# Populate the DataFrame with found neighbors
for stress_index, neighbors in similarities.items():
    drought_sequence = combined_sequences.iloc[combined_sequences.index.get_loc(stress_index)]['circName']

    # If the "drought" sequence found less than 5 valid neighbors
    if len(neighbors) == 0:
        data.append({
            'seq_ref_drought': drought_sequence,
            'nearest_neighbor_circrna': '',
            'condition': ''
        })
    else:
        # Add the neighbors to the DataFrame
        for i in range(len(neighbors)):
            neighbor_sequence = combined_sequences.iloc[neighbors[i]]['circName']
            neighbor_condition = combined_sequences.iloc[neighbors[i]]['stress']
            data.append({
                'seq_ref_drought': drought_sequence,
                'nearest_neighbor_circrna': neighbor_sequence,
                'condition': neighbor_condition
            })

# Add sequences with no valid neighbors to the DataFrame
for seq in sequences_with_less_than_5_neighbors:
    data.append({
        'seq_ref_drought': seq,
        'nearest_neighbor_circrna': '',
        'condition': ''
    })

# Create the final DataFrame
df_results = pd.DataFrame(data)

# Similarity analysis
total_neighbors = len(similarities) * n_neighbors
num_drought_sequences = len(drought_sequences)

print(f'Number of iterated drought stress sequences: {num_drought_sequences}')
print(f'Total number of neighbors in the "drought" group: {drought_count}')
print(f'Total number of neighbors in the "control" group: {control_count}')
print(f'Percentage of neighbors in the "drought" group: {drought_count / total_neighbors * 100:.2f}%')
print(f'Percentage of neighbors in the "control" group: {control_count / total_neighbors * 100:.2f}%')

# Verifying the number of rows in the final DataFrame
print(f'Total number of rows in df_results: {len(df_results)}')

df_results

Valid neighbors count per 'drought' sequence:
[5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,

Unnamed: 0,seq_ref_drought,nearest_neighbor_circrna,condition
0,zma-circ449-Zm00001d010974,zma-circ19279-Zm00001d008559,drought
1,zma-circ449-Zm00001d010974,zma-circ298-Zm00001d029571,control
2,zma-circ449-Zm00001d010974,zma-circ38385--,control
3,zma-circ449-Zm00001d010974,zma-circ19389-Zm00001d038104,drought
4,zma-circ449-Zm00001d010974,zma-circ19253-Zm00001d050219,drought
...,...,...,...
56025,zma-circ37973-Zm00001d048176,zma-circ30713-Zm00001d048176,control
56026,zma-circ37973-Zm00001d048176,zma-circ37657-Zm00001d048176,control
56027,zma-circ37973-Zm00001d048176,zma-circ31024-Zm00001d048176,control
56028,zma-circ37973-Zm00001d048176,zma-circ37633-Zm00001d048176,control


In [12]:
df_results.to_excel('neighbors_results_drought_maize.xlsx', index=False)