# Annoy Analysis:
Annoy (Approximate Nearest Neighbors Oh Yeah) is an efficient technique for finding nearest neighbors in large datasets, particularly in applications involving vectors, such as machine learning and information retrieval.

## How Annoy Works:

Data (typically vectors) are added to an index. Each vector represents a numerical representation of an item.

Tree Division: Annoy builds multiple search trees, where each tree is constructed based on a random sample of the data. The idea is to partition the space into regions, allowing for faster neighbor searches.

Search Algorithm:
When querying a vector, Annoy searches through the constructed trees. It traverses the trees to find the nearest neighbors.
The algorithm is designed to be fast, using approximation, which means it might not always find the exact nearest neighbors but frequently finds very close neighbors efficiently.

In [1]:
!pip install annoy

Collecting annoy
  Downloading annoy-1.17.3.tar.gz (647 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/647.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.5/647.5 kB[0m [31m6.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m647.5/647.5 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: annoy
  Building wheel for annoy (setup.py) ... [?25l[?25hdone
  Created wheel for annoy: filename=annoy-1.17.3-cp310-cp310-linux_x86_64.whl size=552447 sha256=fe7e2759c647ed1477bbaf56cab15a9b808fc440b50151024d96209564829dfe
  Stored in directory: /root/.cache/pip/wheels/64/8a/da/f714bcf46c5efdcfcac0559e63370c21abe961c48e3992465a
Successfully built annoy
Installing collected packages: annoy
Successfully installed annoy-1.17.3


In [2]:
# Import Libs
import pandas as pd
from annoy import AnnoyIndex

In [3]:
df = pd.read_excel('maize_w2vec_3mer_dataset.xlsx')

In [4]:
df.head(2)

Unnamed: 0,circName,circID,gene,isoform,stress,tissue,chr,start,end,strand,...,wc_3mer_55,wc_3mer_56,wc_3mer_57,wc_3mer_58,wc_3mer_59,wc_3mer_60,wc_3mer_61,wc_3mer_62,wc_3mer_63,wc_3mer_64
0,zma-circ1-Zm00001d002325,2:10317309-10317467_-,Zm00001d002325,Zm00001d002325_T001,-,multipleTissue,2,10317309,10317467,-,...,-22.70613,-10.906356,27.552442,0.419338,-7.335388,-4.650829,13.64414,-0.766124,-20.574353,-42.013476
1,zma-circ2-Zm00001d038675,6:162376852-162378246_+,Zm00001d038675,Zm00001d038675_T004,-,multipleTissue,6,162376852,162378246,+,...,-83.926546,86.007756,13.933029,-58.683456,-49.832305,46.633372,-282.08306,-115.708011,99.670065,-119.454137


In [5]:
dimension = 64

# Drought stress dataset

In [6]:
df_drought = df.query('stress == "-" or stress == "drought"').replace('-', 'control')

In [18]:
df_vec = df_drought.drop(['seq','tissue','chr','start','end','strand','start_anno','circID', 'gene', 'isoform', 'width', 'detection_score', 'stress_detection_score','end_anno', 'antisense', 'algorithm', 'exonSeq',	'predAA',	'miRNA',	'superCircRNARegion'], axis=1)
df_vec

Unnamed: 0,circName,stress,wc_3mer_1,wc_3mer_2,wc_3mer_3,wc_3mer_4,wc_3mer_5,wc_3mer_6,wc_3mer_7,wc_3mer_8,...,wc_3mer_55,wc_3mer_56,wc_3mer_57,wc_3mer_58,wc_3mer_59,wc_3mer_60,wc_3mer_61,wc_3mer_62,wc_3mer_63,wc_3mer_64
0,zma-circ1-Zm00001d002325,control,-18.428862,10.705422,31.344736,5.652841,-0.779334,3.498795,-19.853127,15.140625,...,-22.706130,-10.906356,27.552442,0.419338,-7.335388,-4.650829,13.644140,-0.766124,-20.574353,-42.013476
1,zma-circ2-Zm00001d038675,control,188.770603,-192.573027,-201.852659,-271.642781,119.762526,66.806197,-213.114692,174.141792,...,-83.926546,86.007756,13.933029,-58.683456,-49.832305,46.633372,-282.083060,-115.708011,99.670065,-119.454137
2,zma-circ3-Zm00001d038163,control,-12.990614,25.964586,34.343807,11.887014,-1.077510,-21.205808,20.403219,8.169983,...,19.000492,-27.264513,-23.633633,33.123389,36.564765,8.799955,24.341052,4.967472,-12.689407,-0.401715
3,zma-circ4-Zm00001d049552,control,-12.245401,-0.431559,29.199812,-2.821389,3.386556,10.741132,-10.584288,6.070870,...,-22.908495,7.463494,10.920428,4.524147,-15.538093,-13.341156,-0.296664,-3.016197,-0.477901,-29.496298
4,zma-circ5-Zm00001d032567,control,13.412131,-9.102908,-52.040971,-77.734476,-17.086470,15.942249,-55.777546,34.609959,...,-2.619020,-15.054933,75.592412,-3.791055,26.519955,0.549389,-57.986867,-63.266628,5.492640,-47.422619
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38780,zma-circ38781--,control,-31.971423,46.936561,37.374460,24.961271,-20.605168,-39.592321,42.670800,-18.597109,...,55.547672,-34.047023,-53.817768,24.989076,78.595877,12.960359,41.452756,24.087117,-24.891010,24.140963
38781,zma-circ38782--,control,-7.262153,24.640844,-6.384969,-38.287346,7.022847,-3.239892,-1.993565,-7.791742,...,16.006142,-7.553426,9.093510,10.978614,37.843186,18.243088,-5.548205,-11.582676,-3.474853,13.129991
38782,zma-circ38783--,control,-75.265843,89.033060,39.742789,1.053011,-37.685334,-40.503932,34.544268,-78.418910,...,111.958343,-4.080127,-47.411417,19.781623,169.187258,13.342812,63.040171,58.404351,-10.334180,49.309830
38783,zma-circ38784--,control,82.905541,27.218369,-27.637714,-128.632740,143.095296,-70.905655,106.884221,108.640803,...,67.985388,16.778138,-142.177544,110.127476,96.632890,181.435268,-39.288972,4.967059,35.011970,79.163880


In [19]:
df_vec['stress'].value_counts()

Unnamed: 0_level_0,count
stress,Unnamed: 1_level_1
control,20809
drought,11206


In [23]:
import pandas as pd
from annoy import AnnoyIndex

drought_sequences = df_vec[df_vec['stress'] == 'drought']
control_sequences = df_vec[df_vec['stress'] == 'control']

control_sample = control_sequences.sample(n=11206, random_state=42)

combined_sequences = pd.concat([drought_sequences, control_sample])

t = AnnoyIndex(dimension, 'angular')
for i in range(len(combined_sequences)):
    t.add_item(i, combined_sequences.iloc[i, 2:2 + dimension].values)
t.build(10)

similarities = {}
n_vizinhos = 5  # Number of neighbors

# Analyzing only 'drought' sequences
for estresse_index in drought_sequences.index:
    if estresse_index < len(df_vec):
        adjusted_index = combined_sequences.index.get_loc(estresse_index)

        # Search neighbors
        vizinhos = t.get_nns_by_item(adjusted_index, n_vizinhos + 1)
        valid_vizinhos = []

        for i in vizinhos:
            if combined_sequences.iloc[i]['circName'] != combined_sequences.iloc[adjusted_index]['circName']:
                valid_vizinhos.append(i)
            if len(valid_vizinhos) == n_vizinhos:
                break

        similarities[estresse_index] = valid_vizinhos

# Results
data = []

drought_count = 0
control_neighbors = set()

for estresse_index, vizinhos in similarities.items():
    drought_sequence = combined_sequences.iloc[combined_sequences.index.get_loc(estresse_index)]['circName']
    similar_sequences = [combined_sequences.iloc[i]['circName'] for i in vizinhos]
    similar_conditions = [combined_sequences.iloc[i]['stress'] for i in vizinhos]

    for i in vizinhos:
        if combined_sequences.iloc[i]['stress'] == 'control':
            control_neighbors.add(i)

    drought_count += similar_conditions.count('drought')

    for seq, cond in zip(similar_sequences, similar_conditions):
        data.append({
            'seq_ref_drought': drought_sequence,
            'nearest_neighbor_circrna': seq,
            'condition': cond
        })

control_count = len(control_neighbors)

df_results = pd.DataFrame(data)

# Similarity analysis
total_vizinhos = len(similarities) * n_vizinhos
num_drought_sequences = len(drought_sequences)

print(f'Number of iterated drought stress sequences: {num_drought_sequences}')
print(f'Total number of neighbors in the "drought" group: {drought_count}')
print(f'Total number of neighbors in the "control" group: {control_count}')
print(f'Percentage of neighbors in the "drought" group: {drought_count / total_vizinhos * 100:.2f}%')
print(f'Percentage of neighbors in the "control" group: {control_count / total_vizinhos * 100:.2f}%')

df_results

Number of iterated drought stress sequences: 11206
Total number of neighbors in the "drought" group: 24406
Total number of neighbors in the "control" group: 7881
Percentage of neighbors in the "drought" group: 53.81%
Percentage of neighbors in the "control" group: 17.38%


Unnamed: 0,seq_ref_drought,nearest_neighbor_circrna,condition
0,zma-circ449-Zm00001d010974,zma-circ19279-Zm00001d008559,drought
1,zma-circ449-Zm00001d010974,zma-circ38385--,control
2,zma-circ449-Zm00001d010974,zma-circ19253-Zm00001d050219,drought
3,zma-circ449-Zm00001d010974,zma-circ17961-Zm00001d003713,control
4,zma-circ449-Zm00001d010974,zma-circ21216-Zm00001d029198,control
...,...,...,...
45350,zma-circ32015-Zm00001d051908,zma-circ34643-GRMZM5G859440,drought
45351,zma-circ32015-Zm00001d051908,zma-circ28401--,control
45352,zma-circ32015-Zm00001d051908,zma-circ30688-Zm00001d018077,control
45353,zma-circ32015-Zm00001d051908,zma-circ14230-Zm00001d003148,drought


In [24]:
df_results.to_excel('neighbors_results_drought_maize.xlsx', index=False)