## Connect to Google Drive and download the csv file

In [45]:
# https://drive.google.com/file/d/1I32ThMi6N3ESoPz2GvXfo6q1DBz1iW0o/view?usp=sharing
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

file_id = '1I32ThMi6N3ESoPz2GvXfo6q1DBz1iW0o'
downloaded = drive.CreateFile({'id':file_id})
downloaded.FetchMetadata(fetch_all=True)
downloaded.GetContentFile(downloaded.metadata['title'])

#### Unzip the file

In [46]:
!tar -xf java_posts_2016_515947_464652.zip

#### Display file size

In [47]:
!du -sh java_posts_2016_515947_464652.csv

1.1G	java_posts_2016_515947_464652.csv


In [48]:
import pandas as pd
java_posts = pd.read_csv('java_posts_2016_515947_464652.csv')
java_posts.shape

(515947, 25)

In [49]:
print(len(java_posts))
print(len(java_posts[java_posts['post_id']!=java_posts['id']]))
print(java_posts['related_post_id'].isnull().sum())

515947
355675
355675


In [50]:
java_posts.head()

Unnamed: 0,id,title,body,accepted_answer_id,answer_count,comment_count,community_owned_date,creation_date,favorite_count,last_activity_date,...,parent_id,post_type_id,score,tags,view_count,id_1,creation_date_1,link_type_id,post_id,related_post_id
0,38037695,How to fix java.lang.NoClassDefFoundError when...,<p>I am trying to create a runnable jar from t...,,2,0,,2016-06-26 10:46:46.907000+00:00,,2016-06-27 03:51:46.987000+00:00,...,,1,0,java|intellij-idea|jar,4104,1221201000.0,2016-06-27 03:51:46.987000+00:00,1.0,38037695.0,20952713.0
1,37748461,"TreeMap in TreeMap , can't get value from seco...",<p>I have this task.\nInput is </p>\n\n<pre><c...,37748671.0,1,0,,2016-06-10 12:35:40.593000+00:00,,2016-06-10 12:52:45.403000+00:00,...,,1,0,java|treemap,65,,,,,
2,37846762,Injecting JSON Data From one Function to Anoth...,"<p>I am currently in a node, angular, express,...",,2,0,,2016-06-15 22:19:40.213000+00:00,,2016-06-17 23:58:39.357000+00:00,...,,1,1,javascript|jquery|angularjs,75,,,,,
3,37791163,JavaFX PrintAPI wrong PaperSource,<p>I'm using the JavaFx Print-Dialog to custom...,38003854.0,3,0,,2016-06-13 13:33:16.687000+00:00,2.0,2017-09-06 21:13:02.367000+00:00,...,,1,7,java|javafx|printing,887,,,,,
4,37996445,How to trigger node filename.js file in Electron?,<p>I'm trying to trigger one JS file from Elec...,,2,0,,2016-06-23 15:54:15.880000+00:00,,2017-07-25 15:32:59.160000+00:00,...,,1,1,javascript|node.js|electron|child-process,124,1220824000.0,2016-06-24 14:35:18.410000+00:00,1.0,37996445.0,27688804.0


In [51]:
java_posts.columns

Index(['id', 'title', 'body', 'accepted_answer_id', 'answer_count',
       'comment_count', 'community_owned_date', 'creation_date',
       'favorite_count', 'last_activity_date', 'last_edit_date',
       'last_editor_display_name', 'last_editor_user_id', 'owner_display_name',
       'owner_user_id', 'parent_id', 'post_type_id', 'score', 'tags',
       'view_count', 'id_1', 'creation_date_1', 'link_type_id', 'post_id',
       'related_post_id'],
      dtype='object')

In [52]:
columns_to_drop = [
    'accepted_answer_id', 'answer_count', 'comment_count', 'community_owned_date',
    'creation_date', 'favorite_count', 'last_activity_date', 'last_edit_date',
    'last_editor_display_name', 'last_editor_user_id', 'owner_display_name',
    'owner_user_id', 'parent_id', 'post_type_id', 'score', 'tags',
    'view_count', 'id_1', 'creation_date_1', 'link_type_id'
]
df = java_posts.drop(columns=columns_to_drop)

df = df.dropna(subset=['post_id'])
df = df.dropna(subset=['related_post_id'])
df = df[df['post_id']>0]
df = df[df['related_post_id']>0]

df['post_id'] = df['post_id'].astype(int)
df['related_post_id'] = df['related_post_id'].astype(int)

In [53]:
df.shape

(160272, 5)

In [58]:
df.head(30)


Unnamed: 0,id,title,body,post_id,related_post_id
0,38037695,How to fix java.lang.NoClassDefFoundError when...,<p>I am trying to create a runnable jar from t...,38037695,20952713
4,37996445,How to trigger node filename.js file in Electron?,<p>I'm trying to trigger one JS file from Elec...,37996445,27688804
5,37721013,How to download a large Zip from GAE,<p>I used the methods suggested here: \n<a hre...,37721013,24603201
10,37799719,How to take a text input from my Java program ...,"<p>So, say I type ""message"" into my program. H...",37799719,37797886
11,37625867,Make Search Box GET Page on Enter,<p>We want to make the following search box GE...,37625867,503093
16,37643092,UnsatisfiedLinkError for libMagick.so.10 while...,<p>I am trying to run a program which uses JMa...,37643092,37513726
17,38045996,Push to array in nested ng-repeat,"<p><div class=""snippet"" data-lang=""js"" data-hi...",38045996,21422510
28,37217280,Getting an HTTP Status 500 - org.hibernate.int...,"<p>I am working on a project using Java, Sprin...",37217280,35725306
30,37144961,How to change app:elevation programmatically,<p>My app is work for API 19. So I can't use a...,37144961,33054316
31,37358184,How to Use D3 to Angular Project from Bower In...,<p>I am trying to include D3 into my Angular A...,37358184,22434742


In [59]:
valid_related_post_ids=[int(x) for x in df['related_post_id'] if len(df[df['post_id']==x])>0]

In [60]:
len(valid_related_post_ids)

5302

In [62]:
data = []

for id, x in enumerate(valid_related_post_ids):
    master_question = df.loc[df['post_id'] == x, 'title'].iloc[0]
    nonmaster_question = df.loc[df['related_post_id'] == x, 'title'].iloc[0]

    data.append({
        'index': id,
        'master_questions': master_question,
        'nonmaster_questions': nonmaster_question,
        'duplicate': 1
    })

# تبدیل به DataFrame
result_df = pd.DataFrame(data)


In [63]:
len(result_df)

5302

In [64]:
result_df.head(30)

Unnamed: 0,index,master_questions,nonmaster_questions,duplicate
0,0,org.hibernate.internal.util.config.Configurati...,Getting an HTTP Status 500 - org.hibernate.int...,1
1,1,Error Occurs while Running Android Project - U...,Android project cannot run in myeclipse,1
2,2,How can I enable IntelliSense for JavaScript i...,Visual Studio autocomplete not working for jav...,1
3,3,How to use a groupBy with json file angularjs,How to use a groupBy filter in controller leve...,1
4,4,How to fetch password from Alfresco UI page wh...,Auto-generation of email with username and ran...,1
5,5,FabricJs and Polygon transformed coordinates,How does transforming points with a transformM...,1
6,6,How can I get my java program running on GPU ？...,How to accelerate my program？,1
7,7,Converting Time & Date to relative time (CSV p...,Saving Data from a JavaFX-Application without ...,1
8,8,Open local pdf file from a local html file,How to open a pdf document on click of hyper l...,1
9,9,Jackson custom filter with full POJO data bind,Conditional field requirement based on another...,1


In [61]:
duplicate_post=df[(df['post_id'].isin(valid_related_post_ids)) | (df['related_post_id'].isin(valid_related_post_ids))]

In [14]:
duplicate_post

Unnamed: 0,id,title,body,post_id,related_post_id
28,37217280,Getting an HTTP Status 500 - org.hibernate.int...,"<p>I am working on a project using Java, Sprin...",37217280,35725306
122,36193602,Android project cannot run in myeclipse,<p>I create a HelloWorld Android application p...,36193602,36008207
143,38687262,Visual Studio autocomplete not working for jav...,<p>I'm trying out <code>visual studio</code> c...,38687262,37055382
283,34832524,Closing mapped streams - what's the idea?,<p>It's well known that Javadoc says about <co...,34832524,34072035
299,37555031,Why does .json() return a promise?,<p>I've been messing around with the <code>fet...,37555031,28250680
...,...,...,...,...,...
515835,34869352,How to declare private variables and private m...,<p><strong>in es5 we use constructor function<...,34869352,34418012
515867,39199544,How to get today's date in java if system date...,<p>My motive is to get current date without ca...,39199544,38922754
515896,37082744,Split one quadratic bezier curve into two,<p>So I have an imaginary circle divided into ...,37082744,17083580
515922,40968998,Calling an exe-File with JVM Arguments,<p>i am currently developing a java-Programm t...,40968998,40933950


In [15]:
duplicate_post.columns

Index(['id', 'title', 'body', 'post_id', 'related_post_id'], dtype='object')

In [16]:
import pandas as pd
from tqdm import tqdm
merged_df = duplicate_post.merge(duplicate_post, left_on='related_post_id', right_on='post_id', suffixes=('_master', '_nonmaster'))

data = []
for id, row in tqdm(merged_df.iterrows(), total=len(merged_df), desc='Processing rows'):
    data.append({
        'index': id,
        'master_questions': row['title_master'],
        'nonmaster_questions': row['title_nonmaster'],
        'duplicate': 1
    })

result_df = pd.DataFrame(data)



Processing rows: 100%|██████████| 9498/9498 [00:00<00:00, 10002.88it/s]


In [65]:
not_duplicate_df = java_posts[java_posts['related_post_id'].isna()]
len(not_duplicate_df)
import pandas as pd
import numpy as np

sample_size = 5302

if len(not_duplicate_df) < sample_size * 2:
    raise ValueError("DataFrame does not contain enough records for sampling")

data = []
for i in range(sample_size):
    master_row = not_duplicate_df.sample(n=1, random_state=np.random.randint(0, 10000)).iloc[0]

    nonmaster_row = not_duplicate_df.sample(n=1, random_state=np.random.randint(0, 10000)).iloc[0]
    i = i + 5302
    data.append({
        'index': i,
        'master_questions': master_row['title'],
        'nonmaster_questions': nonmaster_row['title'],
        'duplicate': 0
    })

data_df = pd.DataFrame(data)

print(data_df.head())


   index                                   master_questions  \
0   5302  Assgining textures of .mtl file in OpenGL ES 2...   
1   5303   JQuery Script not checking check-boxes correctly   
2   5304  Why are the Angularjs Material Cards being pus...   
3   5305      Error 404, not finding scripts on my computer   
4   5306            Mocking an injected field in unit tests   

                                 nonmaster_questions  duplicate  
0  Jquery Custom event from page loaded with AJAX...          0  
1  do not fail spring container if bean does not ...          0  
2           Angularjs simple grid Table sort by date          0  
3  Attempting to use an incompatible return type ...          0  
4  How to import ssl wildcard cert from Apache to...          0  


In [67]:
final_df = pd.concat([result_df, data_df], ignore_index=True)

# Display the head of the final DataFrame
print(final_df[final_df['index']==3000])

      index                                   master_questions  \
3000   3000  Can't fit file encoding when working with Chro...   

                                    nonmaster_questions  duplicate  
3000  Can't fit file encoding when writing a file wi...          1  


In [68]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers, models, Input, Model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Dropout, concatenate
from tensorflow.keras.optimizers import Adam

# Sample data loading
# Assuming final_df is already defined and loaded with data


# Preprocess the data
df = final_df.dropna().reset_index(drop=True)
df["question1"] = df["master_questions"].str.lower().str.replace("[^\w\s]", "", regex=True).str.replace("\d+", "", regex=True).str.replace("\r", "", regex=True).str.replace("\n", "", regex=True)
df["question2"] = df["nonmaster_questions"].str.lower().str.replace("[^\w\s]", "", regex=True).str.replace("\d+", "", regex=True).str.replace("\r", "", regex=True).str.replace("\n", "", regex=True)



# Prepare the tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df["question1"].tolist() + df["question2"].tolist())

# Convert text to sequences
X1 = tokenizer.texts_to_sequences(df["question1"])
X2 = tokenizer.texts_to_sequences(df["question2"])

# Pad sequences to ensure equal length
max_length = max(max(len(seq) for seq in X1), max(len(seq) for seq in X2))
X1 = pad_sequences(X1, maxlen=max_length, padding='post')
X2 = pad_sequences(X2, maxlen=max_length, padding='post')

# Ensure all sequences are within the vocabulary size
vocab_size = len(tokenizer.word_index) + 1
X1 = [[min(word, vocab_size - 1) for word in seq] for seq in X1]
X2 = [[min(word, vocab_size - 1) for word in seq] for seq in X2]

# Concatenate question pairs
X = [x1 + x2 for x1, x2 in zip(X1, X2)]
X = pad_sequences(X, maxlen=2 * max_length, padding='post')

# Prepare labels
y = df["duplicate"].values

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the model
embedding_dim = 128

model = models.Sequential()
model.add(layers.Embedding(vocab_size, embedding_dim, input_length=2 * max_length))
model.add(LSTM(32, return_sequences=True))
model.add(LSTM(64))
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, batch_size=64, epochs=10, validation_split=0.1)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Loss: {loss}, Accuracy: {accuracy}')


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Loss: 0.6718423366546631, Accuracy: 0.605374813079834


In [84]:
import numpy as np

# Randomly choose an index from the test set
random_index = np.random.choice(len(X_test))

# Extract the question pairs at the chosen index
question1 = X_test[random_index][:max_length]
question2 = X_test[random_index][max_length:]

# Reshape to fit model input shape (batch_size, 2 * max_length)
question1 = np.reshape(question1, (1, max_length))
question2 = np.reshape(question2, (1, max_length))

# Concatenate question pairs
concatenated = np.concatenate((question1, question2), axis=1)

# Predict the label
prediction = model.predict(concatenated)

# Retrieve the original label
original_label = y_test[random_index]

# Print the results
print("Question 1:", tokenizer.sequences_to_texts([question1[0]])[0])
print("Question 2:", tokenizer.sequences_to_texts([question2[0]])[0])
print("Predicted Label:", prediction[0][0])
print("Original Label:", original_label)


Question 1: retrieve data dynamically from mysql into php page
Question 2: gridview issue on a tabbed activity
Predicted Label: 0.36830342
Original Label: 0
